Diffstat (limited to 'fs')
119 files changed, 3711 insertions, 1016 deletions
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 6765949b3aab..380ad5ace7cf 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -169,7 +169,7 @@ static int afs_record_cm_probe(struct afs_call *call, struct afs_server *server)
 
 	spin_lock(&server->probe_lock);
 
-	if (!test_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags)) {
+	if (!test_and_set_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags)) {
 		server->cm_epoch = call->epoch;
 		server->probe.cm_epoch = call->epoch;
 		goto out;
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index e1b9ed679045..37d1bba57b00 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -32,9 +32,8 @@ void afs_fileserver_probe_result(struct afs_call *call)
 	struct afs_server *server = call->server;
 	unsigned int server_index = call->server_index;
 	unsigned int index = call->addr_ix;
-	unsigned int rtt = UINT_MAX;
+	unsigned int rtt_us = 0;
 	bool have_result = false;
-	u64 _rtt;
 	int ret = call->error;
 
 	_enter("%pU,%u", &server->uuid, index);
@@ -93,15 +92,9 @@ responded:
 		}
 	}
 
-	/* Get the RTT and scale it to fit into a 32-bit value that represents
-	 * over a minute of time so that we can access it with one instruction
-	 * on a 32-bit system.
-	 */
-	_rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall);
-	_rtt /= 64;
-	rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt;
-	if (rtt < server->probe.rtt) {
-		server->probe.rtt = rtt;
+	rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall);
+	if (rtt_us < server->probe.rtt) {
+		server->probe.rtt = rtt_us;
 		alist->preferred = index;
 		have_result = true;
 	}
@@ -113,15 +106,11 @@ out:
 	spin_unlock(&server->probe_lock);
 
 	_debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
-	       server_index, index, &alist->addrs[index].transport,
-	       (unsigned int)rtt, ret);
+	       server_index, index, &alist->addrs[index].transport, rtt_us, ret);
 
 	have_result |= afs_fs_probe_done(server);
-	if (have_result) {
-		server->probe.have_result = true;
-		wake_up_var(&server->probe.have_result);
+	if (have_result)
 		wake_up_all(&server->probe_wq);
-	}
 }
 
 /*
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 68fc46634346..d2b3798c1932 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -385,8 +385,6 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 		ASSERTCMP(req->offset, <=, PAGE_SIZE);
 		if (req->offset == PAGE_SIZE) {
 			req->offset = 0;
-			if (req->page_done)
-				req->page_done(req);
 			req->index++;
 			if (req->remain > 0)
 				goto begin_page;
@@ -440,11 +438,13 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 		if (req->offset < PAGE_SIZE)
 			zero_user_segment(req->pages[req->index],
 					  req->offset, PAGE_SIZE);
-		if (req->page_done)
-			req->page_done(req);
 		req->offset = 0;
 	}
 
+	if (req->page_done)
+		for (req->index = 0; req->index < req->nr_pages; req->index++)
+			req->page_done(req);
+
 	_leave(" = 0 [done]");
 	return 0;
 }
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index ef732dd4e7ef..80255513e230 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -533,12 +533,10 @@ struct afs_server {
 		u32	abort_code;
 		u32	cm_epoch;
 		short	error;
-		bool	have_result;
 		bool	responded:1;
 		bool	is_yfs:1;
 		bool	not_yfs:1;
 		bool	local_failure:1;
-		bool	no_epoch:1;
 		bool	cm_probed:1;
 		bool	said_rebooted:1;
 		bool	said_inconsistent:1;
@@ -1335,7 +1333,7 @@ extern struct afs_volume *afs_create_volume(struct afs_fs_context *);
 extern void afs_activate_volume(struct afs_volume *);
 extern void afs_deactivate_volume(struct afs_volume *);
 extern void afs_put_volume(struct afs_cell *, struct afs_volume *);
-extern int afs_check_volume_status(struct afs_volume *, struct key *);
+extern int afs_check_volume_status(struct afs_volume *, struct afs_fs_cursor *);
 
 /*
  * write.c
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 172ba569cd60..2a3305e42b14 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -192,7 +192,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
 			write_unlock(&vnode->volume->servers_lock);
 
 			set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
-			error = afs_check_volume_status(vnode->volume, fc->key);
+			error = afs_check_volume_status(vnode->volume, fc);
 			if (error < 0)
 				goto failed_set_error;
 
@@ -281,7 +281,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
 
 			set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags);
 			set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
-			error = afs_check_volume_status(vnode->volume, fc->key);
+			error = afs_check_volume_status(vnode->volume, fc);
 			if (error < 0)
 				goto failed_set_error;
 
@@ -341,7 +341,7 @@ start:
 	/* See if we need to do an update of the volume record.  Note that the
 	 * volume may have moved or even have been deleted.
 	 */
-	error = afs_check_volume_status(vnode->volume, fc->key);
+	error = afs_check_volume_status(vnode->volume, fc);
 	if (error < 0)
 		goto failed_set_error;
diff --git a/fs/afs/server.c b/fs/afs/server.c
index b7f3cb2130ca..11b90ac7ea30 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -594,12 +594,9 @@ retry:
 	}
 
 	ret = wait_on_bit(&server->flags, AFS_SERVER_FL_UPDATING,
-			  TASK_INTERRUPTIBLE);
+			  (fc->flags & AFS_FS_CURSOR_INTR) ?
+			  TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
 	if (ret == -ERESTARTSYS) {
-		if (!(fc->flags & AFS_FS_CURSOR_INTR) && server->addresses) {
-			_leave(" = t [intr]");
-			return true;
-		}
 		fc->error = ret;
 		_leave(" = f [intr]");
 		return false;
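The fs_probe.c hunk above (and the matching vl_probe.c hunk below) drops the open-coded scaling of the rxrpc RTT in favour of rxrpc_kernel_get_srtt(), which already reports a smoothed RTT in microseconds that fits an unsigned int. For illustration only, a hypothetical helper equivalent to the deleted logic would be:

/* Invented helper, not part of the patch: squeeze a nanosecond u64 RTT
 * into 32 bits by dividing by 64, so that over a minute of time fits and
 * can be read with one instruction on a 32-bit system; larger values
 * saturate at UINT_MAX. */
static unsigned int rtt_ns_to_u32(u64 rtt_ns)
{
	u64 scaled = rtt_ns / 64;

	return (scaled > UINT_MAX) ? UINT_MAX : (unsigned int)scaled;
}

With the smoothed value maintained in microseconds inside rxrpc, both probe handlers can compare rtt_us against server->probe.rtt directly.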
diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
index 858498cc1b05..e3aa013c2177 100644
--- a/fs/afs/vl_probe.c
+++ b/fs/afs/vl_probe.c
@@ -31,10 +31,9 @@ void afs_vlserver_probe_result(struct afs_call *call)
 	struct afs_addr_list *alist = call->alist;
 	struct afs_vlserver *server = call->vlserver;
 	unsigned int server_index = call->server_index;
+	unsigned int rtt_us = 0;
 	unsigned int index = call->addr_ix;
-	unsigned int rtt = UINT_MAX;
 	bool have_result = false;
-	u64 _rtt;
 	int ret = call->error;
 
 	_enter("%s,%u,%u,%d,%d", server->name, server_index, index, ret, call->abort_code);
@@ -93,15 +92,9 @@ responded:
 		}
 	}
 
-	/* Get the RTT and scale it to fit into a 32-bit value that represents
-	 * over a minute of time so that we can access it with one instruction
-	 * on a 32-bit system.
-	 */
-	_rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall);
-	_rtt /= 64;
-	rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt;
-	if (rtt < server->probe.rtt) {
-		server->probe.rtt = rtt;
+	rtt_us = rxrpc_kernel_get_srtt(call->net->socket, call->rxcall);
+	if (rtt_us < server->probe.rtt) {
+		server->probe.rtt = rtt_us;
 		alist->preferred = index;
 		have_result = true;
 	}
@@ -113,8 +106,7 @@ out:
 	spin_unlock(&server->probe_lock);
 
 	_debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
-	       server_index, index, &alist->addrs[index].transport,
-	       (unsigned int)rtt, ret);
+	       server_index, index, &alist->addrs[index].transport, rtt_us, ret);
 
 	have_result |= afs_vl_probe_done(server);
 	if (have_result) {
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
index 9a5ce9687779..72eacc14e6e1 100644
--- a/fs/afs/vl_rotate.c
+++ b/fs/afs/vl_rotate.c
@@ -302,8 +302,8 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
 			pr_notice("VC:  - nr=%u/%u/%u pf=%u\n",
 				  a->nr_ipv4, a->nr_addrs, a->max_addrs,
 				  a->preferred);
-			pr_notice("VC:  - pr=%lx R=%lx F=%lx\n",
-				  a->probed, a->responded, a->failed);
+			pr_notice("VC:  - R=%lx F=%lx\n",
+				  a->responded, a->failed);
 			if (a == vc->ac.alist)
 				pr_notice("VC:  - current\n");
 		}
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 92ca5e27573b..4310336b9bb8 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -281,7 +281,7 @@ error:
 /*
  * Make sure the volume record is up to date.
  */
-int afs_check_volume_status(struct afs_volume *volume, struct key *key)
+int afs_check_volume_status(struct afs_volume *volume, struct afs_fs_cursor *fc)
 {
 	time64_t now = ktime_get_real_seconds();
 	int ret, retries = 0;
@@ -299,7 +299,7 @@ retry:
 	}
 
 	if (!test_and_set_bit_lock(AFS_VOLUME_UPDATING, &volume->flags)) {
-		ret = afs_update_volume_status(volume, key);
+		ret = afs_update_volume_status(volume, fc->key);
 		clear_bit_unlock(AFS_VOLUME_WAIT, &volume->flags);
 		clear_bit_unlock(AFS_VOLUME_UPDATING, &volume->flags);
 		wake_up_bit(&volume->flags, AFS_VOLUME_WAIT);
@@ -312,7 +312,9 @@ retry:
 		return 0;
 	}
 
-	ret = wait_on_bit(&volume->flags, AFS_VOLUME_WAIT, TASK_INTERRUPTIBLE);
+	ret = wait_on_bit(&volume->flags, AFS_VOLUME_WAIT,
+			  (fc->flags & AFS_FS_CURSOR_INTR) ?
+			  TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
 	if (ret == -ERESTARTSYS) {
 		_leave(" = %d", ret);
 		return ret;
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index b5b45c57e1b1..fe413e7a5cf4 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -497,8 +497,6 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
 		ASSERTCMP(req->offset, <=, PAGE_SIZE);
 		if (req->offset == PAGE_SIZE) {
 			req->offset = 0;
-			if (req->page_done)
-				req->page_done(req);
 			req->index++;
 			if (req->remain > 0)
 				goto begin_page;
@@ -556,11 +554,13 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
 		if (req->offset < PAGE_SIZE)
 			zero_user_segment(req->pages[req->index],
 					  req->offset, PAGE_SIZE);
-		if (req->page_done)
-			req->page_done(req);
 		req->offset = 0;
 	}
 
+	if (req->page_done)
+		for (req->index = 0; req->index < req->nr_pages; req->index++)
+			req->page_done(req);
+
 	_leave(" = 0 [done]");
 	return 0;
 }
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 13f25e241ac4..25d489bc9453 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1733,7 +1733,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 		    (!regset->active || regset->active(t->task, regset) > 0)) {
 			int ret;
 			size_t size = regset_size(t->task, regset);
-			void *data = kmalloc(size, GFP_KERNEL);
+			void *data = kzalloc(size, GFP_KERNEL);
 			if (unlikely(!data))
 				return 0;
 			ret = regset->get(t->task, regset,
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 52b6f646cdbd..93672c3f1c78 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -19,7 +19,6 @@
 #include <linux/module.h>
 #include <linux/blkpg.h>
 #include <linux/magic.h>
-#include <linux/dax.h>
 #include <linux/buffer_head.h>
 #include <linux/swap.h>
 #include <linux/pagevec.h>
@@ -1893,6 +1892,16 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 	struct gendisk *disk = bdev->bd_disk;
 	struct block_device *victim = NULL;
 
+	/*
+	 * Sync early if it looks like we're the last one.  If someone else
+	 * opens the block device between now and the decrement of bd_openers
+	 * then we did a sync that we didn't need to, but that's not the end
+	 * of the world and we want to avoid long (could be several minute)
+	 * syncs while holding the mutex.
+	 */
+	if (bdev->bd_openers == 1)
+		sync_blockdev(bdev);
+
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
 	if (for_part)
 		bdev->bd_part_count--;
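The __blkdev_put() change above trades a possibly redundant sync for not holding bd_mutex across writeback that can take minutes. A minimal sketch of the same check-outside-the-lock shape, with hypothetical names throughout:

#include <linux/mutex.h>

struct lastclose {			/* hypothetical stand-in for block_device */
	struct mutex lock;
	int openers;
};

static void slow_flush(struct lastclose *d)
{
	/* stands in for sync_blockdev(), potentially minutes of writeback */
}

static void lastclose_put(struct lastclose *d)
{
	/*
	 * Unlocked peek: if we look like the last opener, flush now.  A
	 * racing open between this check and the locked decrement merely
	 * costs one unneeded flush; correctness still comes from the
	 * locked section below.
	 */
	if (d->openers == 1)
		slow_flush(d);

	mutex_lock(&d->lock);
	d->openers--;			/* the authoritative count */
	mutex_unlock(&d->lock);
}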
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 9c380e7edf62..0cc02577577b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -391,7 +391,7 @@ static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr)
 	struct rb_node **p = &preftrees->direct.root.rb_root.rb_node;
 	struct rb_node *parent = NULL;
 	struct prelim_ref *ref = NULL;
-	struct prelim_ref target = {0};
+	struct prelim_ref target = {};
 	int result;
 
 	target.parent = bytenr;
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 47f66c6a7d7f..696f47103cfc 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -916,7 +916,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path) {
 		ret = -ENOMEM;
-		goto out;
+		goto out_put_group;
 	}
 
 	/*
@@ -954,7 +954,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
 		if (ret) {
 			btrfs_add_delayed_iput(inode);
-			goto out;
+			goto out_put_group;
 		}
 		clear_nlink(inode);
 		/* One for the block groups ref */
@@ -977,13 +977,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
 	if (ret < 0)
-		goto out;
+		goto out_put_group;
 	if (ret > 0)
 		btrfs_release_path(path);
 	if (ret == 0) {
 		ret = btrfs_del_item(trans, tree_root, path);
 		if (ret)
-			goto out;
+			goto out_put_group;
 		btrfs_release_path(path);
 	}
 
@@ -1102,9 +1102,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
 	ret = remove_block_group_free_space(trans, block_group);
 	if (ret)
-		goto out;
+		goto out_put_group;
 
-	btrfs_put_block_group(block_group);
+	/* Once for the block groups rbtree */
 	btrfs_put_block_group(block_group);
 
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
@@ -1127,6 +1127,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 		/* once for the tree */
 		free_extent_map(em);
 	}
+
+out_put_group:
+	/* Once for the lookup reference */
+	btrfs_put_block_group(block_group);
 out:
 	if (remove_rsv)
 		btrfs_delayed_refs_rsv_release(fs_info, 1);
@@ -1288,11 +1292,15 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto err;
 	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+	if (prev_trans)
+		btrfs_put_transaction(prev_trans);
 
 	return true;
 
 err:
 	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+	if (prev_trans)
+		btrfs_put_transaction(prev_trans);
 	btrfs_dec_block_group_ro(bg);
 	return false;
 }
diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h
index 21a15776dac4..353228d62f5a 100644
--- a/fs/btrfs/discard.h
+++ b/fs/btrfs/discard.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
 
 #ifndef BTRFS_DISCARD_H
 #define BTRFS_DISCARD_H
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a6cb5cbbdb9f..d10c7be10f3b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2036,9 +2036,6 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
 		for (i = 0; i < ret; i++)
 			btrfs_drop_and_free_fs_root(fs_info, gang[i]);
 	}
-
-	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
-		btrfs_free_log_root_tree(NULL, fs_info);
 }
 
 static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
@@ -3888,7 +3885,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
 	spin_unlock(&fs_info->fs_roots_radix_lock);
 
 	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
-		btrfs_free_log(NULL, root);
+		ASSERT(root->log_root == NULL);
 		if (root->reloc_root) {
 			btrfs_put_root(root->reloc_root);
 			root->reloc_root = NULL;
@@ -4211,6 +4208,36 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
 	up_write(&fs_info->cleanup_work_sem);
 }
 
+static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *gang[8];
+	u64 root_objectid = 0;
+	int ret;
+
+	spin_lock(&fs_info->fs_roots_radix_lock);
+	while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+					     (void **)gang, root_objectid,
+					     ARRAY_SIZE(gang))) != 0) {
+		int i;
+
+		for (i = 0; i < ret; i++)
+			gang[i] = btrfs_grab_root(gang[i]);
+		spin_unlock(&fs_info->fs_roots_radix_lock);
+
+		for (i = 0; i < ret; i++) {
+			if (!gang[i])
+				continue;
+			root_objectid = gang[i]->root_key.objectid;
+			btrfs_free_log(NULL, gang[i]);
+			btrfs_put_root(gang[i]);
+		}
+		root_objectid++;
+		spin_lock(&fs_info->fs_roots_radix_lock);
+	}
+	spin_unlock(&fs_info->fs_roots_radix_lock);
+	btrfs_free_log_root_tree(NULL, fs_info);
+}
+
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
 {
 	struct btrfs_ordered_extent *ordered;
@@ -4603,6 +4630,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
 	btrfs_destroy_delayed_inodes(fs_info);
 	btrfs_assert_delayed_root_empty(fs_info);
 	btrfs_destroy_all_delalloc_inodes(fs_info);
+	btrfs_drop_all_logs(fs_info);
 	mutex_unlock(&fs_info->transaction_kthread_mutex);
 
 	return 0;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index d35936c934ab..03bc7134e8cb 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4559,6 +4559,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 		if (IS_ERR(fs_root)) {
 			err = PTR_ERR(fs_root);
 			list_add_tail(&reloc_root->root_list, &reloc_roots);
+			btrfs_end_transaction(trans);
 			goto out_unset;
 		}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8cede6eb9843..2d5498136e5e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -662,10 +662,19 @@ again:
 	}
 
 got_it:
-	btrfs_record_root_in_trans(h, root);
-
 	if (!current->journal_info)
 		current->journal_info = h;
+
+	/*
+	 * btrfs_record_root_in_trans() needs to alloc new extents, and may
+	 * call btrfs_join_transaction() while we're also starting a
+	 * transaction.
+	 *
+	 * Thus it needs to be called after current->journal_info is
+	 * initialized, or we can deadlock.
+	 */
+	btrfs_record_root_in_trans(h, root);
+
 	return h;
 
 join_fail:
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ec36a7c6ba3d..02ebdd9edc19 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4226,6 +4226,9 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
 	const u64 ino = btrfs_ino(inode);
 	struct btrfs_path *dst_path = NULL;
 	bool dropped_extents = false;
+	u64 truncate_offset = i_size;
+	struct extent_buffer *leaf;
+	int slot;
 	int ins_nr = 0;
 	int start_slot;
 	int ret;
@@ -4240,9 +4243,43 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
 	if (ret < 0)
 		goto out;
 
+	/*
+	 * We must check if there is a prealloc extent that starts before the
+	 * i_size and crosses the i_size boundary.  This is to ensure later we
+	 * truncate down to the end of that extent and not to the i_size, as
+	 * otherwise we end up losing part of the prealloc extent after a log
+	 * replay and with an implicit hole if there is another prealloc extent
+	 * that starts at an offset beyond i_size.
+	 */
+	ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
+	if (ret < 0)
+		goto out;
+
+	if (ret == 0) {
+		struct btrfs_file_extent_item *ei;
+
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+		if (btrfs_file_extent_type(leaf, ei) ==
+		    BTRFS_FILE_EXTENT_PREALLOC) {
+			u64 extent_end;
+
+			btrfs_item_key_to_cpu(leaf, &key, slot);
+			extent_end = key.offset +
+				btrfs_file_extent_num_bytes(leaf, ei);
+
+			if (extent_end > i_size)
+				truncate_offset = extent_end;
+		}
+	} else {
+		ret = 0;
+	}
+
 	while (true) {
-		struct extent_buffer *leaf = path->nodes[0];
-		int slot = path->slots[0];
+		leaf = path->nodes[0];
+		slot = path->slots[0];
 
 		if (slot >= btrfs_header_nritems(leaf)) {
 			if (ins_nr > 0) {
@@ -4280,7 +4317,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
 			ret = btrfs_truncate_inode_items(trans,
 							 root->log_root,
 							 &inode->vfs_inode,
-							 i_size,
+							 truncate_offset,
 							 BTRFS_EXTENT_DATA_KEY);
 		} while (ret == -EAGAIN);
 		if (ret)
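To make the tree-log.c comment above concrete, here is a worked example with invented numbers: take i_size = 8192 and a prealloc extent keyed at file offset 4096 with num_bytes = 12288. The extent ends at 16384, which is beyond i_size, so truncate_offset becomes 16384; truncating the log at i_size (8192) instead would clip that extent and, with a second prealloc extent starting beyond i_size, leave everything between 8192 and that next extent as an implicit hole after log replay.

/* Sketch of the offset selection only, with the example numbers above:
 * pick_truncate_offset(8192, 4096, 12288) returns 16384. */
static u64 pick_truncate_offset(u64 i_size, u64 key_offset, u64 num_bytes)
{
	u64 extent_end = key_offset + num_bytes;

	return extent_end > i_size ? extent_end : i_size;
}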
diff --git a/fs/buffer.c b/fs/buffer.c
index 599a0bf7257b..a60f60396cfa 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -967,7 +967,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 	struct page *page;
 	struct buffer_head *bh;
 	sector_t end_block;
-	int ret = 0;		/* Will call free_more_memory() */
+	int ret = 0;
 	gfp_t gfp_mask;
 
 	gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 1dc97f2d6201..e7726f5f1241 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -60,9 +60,9 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
 	object = container_of(op->op.object, struct cachefiles_object, fscache);
 	spin_lock(&object->work_lock);
 	list_add_tail(&monitor->op_link, &op->to_do);
+	fscache_enqueue_retrieval(op);
 	spin_unlock(&object->work_lock);
 
-	fscache_enqueue_retrieval(op);
 	fscache_put_retrieval(op);
 	return 0;
 }
@@ -398,7 +398,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 	struct inode *inode;
 	sector_t block;
 	unsigned shift;
-	int ret;
+	int ret, ret2;
 
 	object = container_of(op->op.object, struct cachefiles_object, fscache);
 
@@ -430,8 +430,8 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 	block = page->index;
 	block <<= shift;
 
-	ret = bmap(inode, &block);
-	ASSERT(ret < 0);
+	ret2 = bmap(inode, &block);
+	ASSERT(ret2 == 0);
 
 	_debug("%llx -> %llx",
 	       (unsigned long long) (page->index << shift),
@@ -739,8 +739,8 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
 	block = page->index;
 	block <<= shift;
 
-	ret = bmap(inode, &block);
-	ASSERT(!ret);
+	ret2 = bmap(inode, &block);
+	ASSERT(ret2 == 0);
 
 	_debug("%llx -> %llx",
 	       (unsigned long long) (page->index << shift),
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 185db76300b3..f1acde6fb9a6 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2749,7 +2749,7 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
 
 	ret = try_get_cap_refs(inode, need, want, 0, flags, got);
 	/* three special error codes */
-	if (ret == -EAGAIN || ret == -EFBIG || ret == -EAGAIN)
+	if (ret == -EAGAIN || ret == -EFBIG || ret == -ESTALE)
 		ret = 0;
 	return ret;
 }
@@ -3746,6 +3746,7 @@ retry:
 			WARN_ON(1);
 			tsession = NULL;
 			target = -1;
+			mutex_lock(&session->s_mutex);
 		}
 		goto retry;
 
@@ -3990,7 +3991,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 			__ceph_queue_cap_release(session, cap);
 			spin_unlock(&session->s_cap_lock);
 		}
-		goto done;
+		goto flush_cap_releases;
 	}
 
 	/* these will work even if we don't have a cap yet */
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 481ac97b4d25..dcaed75de9e6 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -271,7 +271,7 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
 					  &congestion_kb_fops);
 
 	snprintf(name, sizeof(name), "../../bdi/%s",
-		 dev_name(fsc->sb->s_bdi->dev));
+		 bdi_dev_name(fsc->sb->s_bdi));
 	fsc->debugfs_bdi =
 		debugfs_create_symlink("bdi",
 				       fsc->client->debugfs_dir,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 486f91f9685b..7c63abf5bea9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3251,8 +3251,7 @@ static void handle_session(struct ceph_mds_session *session,
 	void *end = p + msg->front.iov_len;
 	struct ceph_mds_session_head *h;
 	u32 op;
-	u64 seq;
-	unsigned long features = 0;
+	u64 seq, features = 0;
 	int wake = 0;
 	bool blacklisted = false;
 
@@ -3271,9 +3270,8 @@ static void handle_session(struct ceph_mds_session *session,
 			goto bad;
 		/* version >= 3, feature bits */
 		ceph_decode_32_safe(&p, end, len, bad);
-		ceph_decode_need(&p, end, len, bad);
-		memcpy(&features, p, min_t(size_t, len, sizeof(features)));
-		p += len;
+		ceph_decode_64_safe(&p, end, features, bad);
+		p += len - sizeof(features);
 	}
 
 	mutex_lock(&mdsc->mutex);
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index de56dee60540..19507e2fdb57 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -159,8 +159,8 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
 	}
 
 	if (IS_ERR(in)) {
-		pr_warn("Can't lookup inode %llx (err: %ld)\n",
-			realm->ino, PTR_ERR(in));
+		dout("Can't lookup inode %llx (err: %ld)\n",
+		     realm->ino, PTR_ERR(in));
 		qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */
 	} else {
 		qri->timeout = 0;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 05dd3dea684b..39b708d9d86d 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1891,7 +1891,8 @@ GLOBAL_EXTERN struct list_head cifs_tcp_ses_list;
 /*
  * This lock protects the cifs_tcp_ses_list, the list of smb sessions per
  * tcp session, and the list of tcon's per smb session. It also protects
- * the reference counters for the server, smb session, and tcon. Finally,
+ * the reference counters for the server, smb session, and tcon. It also
+ * protects some fields in the TCP_Server_Info struct such as dstaddr. Finally,
  * changes to the tcon->tidStatus should be done while holding this lock.
  * generally the locks should be taken in order tcp_ses_lock before
  * tcon->open_file_lock and that before file->file_info_lock since the
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 182b864b3075..5014a82391ff 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2152,8 +2152,8 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
 			}
 		}
 
+		kref_put(&wdata2->refcount, cifs_writedata_release);
 		if (rc) {
-			kref_put(&wdata2->refcount, cifs_writedata_release);
 			if (is_retryable_error(rc))
 				continue;
 			i += nr_pages;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 95b3ab0ca8c0..28268ed461b8 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -375,8 +375,10 @@ static int reconn_set_ipaddr(struct TCP_Server_Info *server)
 		return rc;
 	}
 
+	spin_lock(&cifs_tcp_ses_lock);
 	rc = cifs_convert_address((struct sockaddr *)&server->dstaddr, ipaddr,
 				  strlen(ipaddr));
+	spin_unlock(&cifs_tcp_ses_lock);
 	kfree(ipaddr);
 
 	return !rc ? -1 : 0;
@@ -3373,6 +3375,10 @@ cifs_find_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
 	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each(tmp, &ses->tcon_list) {
 		tcon = list_entry(tmp, struct cifs_tcon, tcon_list);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+		if (tcon->dfs_path)
+			continue;
+#endif
 		if (!match_tcon(tcon, volume_info))
 			continue;
 		++tcon->tc_count;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0b1528edebcf..75ddce8ef456 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -4060,7 +4060,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
 			 * than it negotiated since it will refuse the read
 			 * then.
			 */
-			if ((tcon->ses) && !(tcon->ses->capabilities &
+			if (!(tcon->ses->capabilities &
 				tcon->ses->server->vals->cap_large_files)) {
 				current_read_size = min_t(uint,
 					current_read_size, CIFSMaxBufSize);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 390d2b15ef6e..5d2965a23730 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -730,7 +730,7 @@ static __u64 simple_hashstr(const char *str)
  * cifs_backup_query_path_info - SMB1 fallback code to get ino
  *
  * Fallback code to get file metadata when we don't have access to
- * @full_path (EACCESS) and have backup creds.
+ * @full_path (EACCES) and have backup creds.
  *
  * @data will be set to search info result buffer
  * @resp_buf will be set to cifs resp buf and needs to be freed with
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index a456febd4109..550ce9020a3e 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -1025,51 +1025,99 @@ int copy_path_name(char *dst, const char *src)
 }
 
 struct super_cb_data {
-	struct TCP_Server_Info *server;
+	void *data;
 	struct super_block *sb;
 };
 
-static void super_cb(struct super_block *sb, void *arg)
+static void tcp_super_cb(struct super_block *sb, void *arg)
 {
-	struct super_cb_data *d = arg;
+	struct super_cb_data *sd = arg;
+	struct TCP_Server_Info *server = sd->data;
 	struct cifs_sb_info *cifs_sb;
 	struct cifs_tcon *tcon;
 
-	if (d->sb)
+	if (sd->sb)
 		return;
 
 	cifs_sb = CIFS_SB(sb);
 	tcon = cifs_sb_master_tcon(cifs_sb);
-	if (tcon->ses->server == d->server)
-		d->sb = sb;
+	if (tcon->ses->server == server)
+		sd->sb = sb;
 }
 
-struct super_block *cifs_get_tcp_super(struct TCP_Server_Info *server)
+static struct super_block *__cifs_get_super(void (*f)(struct super_block *, void *),
+					    void *data)
 {
-	struct super_cb_data d = {
-		.server = server,
+	struct super_cb_data sd = {
+		.data = data,
 		.sb = NULL,
 	};
 
-	iterate_supers_type(&cifs_fs_type, super_cb, &d);
+	iterate_supers_type(&cifs_fs_type, f, &sd);
 
-	if (unlikely(!d.sb))
-		return ERR_PTR(-ENOENT);
+	if (!sd.sb)
+		return ERR_PTR(-EINVAL);
 	/*
 	 * Grab an active reference in order to prevent automounts (DFS links)
 	 * of expiring and then freeing up our cifs superblock pointer while
 	 * we're doing failover.
	 */
-	cifs_sb_active(d.sb);
-	return d.sb;
+	cifs_sb_active(sd.sb);
+	return sd.sb;
 }
 
-void cifs_put_tcp_super(struct super_block *sb)
+static void __cifs_put_super(struct super_block *sb)
 {
 	if (!IS_ERR_OR_NULL(sb))
 		cifs_sb_deactive(sb);
 }
 
+struct super_block *cifs_get_tcp_super(struct TCP_Server_Info *server)
+{
+	return __cifs_get_super(tcp_super_cb, server);
+}
+
+void cifs_put_tcp_super(struct super_block *sb)
+{
+	__cifs_put_super(sb);
+}
+
+#ifdef CONFIG_CIFS_DFS_UPCALL
+static void tcon_super_cb(struct super_block *sb, void *arg)
+{
+	struct super_cb_data *sd = arg;
+	struct cifs_tcon *tcon = sd->data;
+	struct cifs_sb_info *cifs_sb;
+
+	if (sd->sb)
+		return;
+
+	cifs_sb = CIFS_SB(sb);
+	if (tcon->dfs_path && cifs_sb->origin_fullpath &&
+	    !strcasecmp(tcon->dfs_path, cifs_sb->origin_fullpath))
+		sd->sb = sb;
+}
+
+static inline struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon)
+{
+	return __cifs_get_super(tcon_super_cb, tcon);
+}
+
+static inline void cifs_put_tcon_super(struct super_block *sb)
+{
+	__cifs_put_super(sb);
+}
+#else
+static inline struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void cifs_put_tcon_super(struct super_block *sb)
+{
+}
+#endif
+
 int update_super_prepath(struct cifs_tcon *tcon, const char *prefix,
 			 size_t prefix_len)
 {
@@ -1077,7 +1125,7 @@ int update_super_prepath(struct cifs_tcon *tcon, const char *prefix,
 	struct cifs_sb_info *cifs_sb;
 	int rc = 0;
 
-	sb = cifs_get_tcp_super(tcon->ses->server);
+	sb = cifs_get_tcon_super(tcon);
 	if (IS_ERR(sb))
 		return PTR_ERR(sb);
 
@@ -1099,6 +1147,6 @@ int update_super_prepath(struct cifs_tcon *tcon, const char *prefix,
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
 
 out:
-	cifs_put_tcp_super(sb);
+	cifs_put_tcon_super(sb);
 	return rc;
 }
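The misc.c refactor above parameterises the superblock search with a match callback and an opaque data pointer, so TCP-server and tcon matchers share one __cifs_get_super() walker. As a hedged illustration of how a further matcher could slot in (hypothetical, not part of the patch), one could match on the SMB session instead:

/* Invented example matcher: find the superblock whose master tcon
 * belongs to a given cifs_ses, reusing the same callback shape. */
static void ses_super_cb(struct super_block *sb, void *arg)
{
	struct super_cb_data *sd = arg;
	struct cifs_ses *ses = sd->data;

	if (sd->sb)
		return;

	if (cifs_sb_master_tcon(CIFS_SB(sb))->ses == ses)
		sd->sb = sb;
}

static struct super_block *cifs_get_ses_super(struct cifs_ses *ses)
{
	return __cifs_get_super(ses_super_cb, ses);
}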
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index b36c46f48705..f829f4165d38 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -687,6 +687,11 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
 	if (smb3_encryption_required(tcon))
 		flags |= CIFS_TRANSFORM_REQ;
 
+	if (!server->ops->new_lease_key)
+		return -EIO;
+
+	server->ops->new_lease_key(pfid);
+
 	memset(rqst, 0, sizeof(rqst));
 	resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER;
 	memset(rsp_iov, 0, sizeof(rsp_iov));
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index cf7b7e1d5bd7..cb733652ecca 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1519,6 +1519,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
 		spin_lock(&configfs_dirent_lock);
 		configfs_detach_rollback(dentry);
 		spin_unlock(&configfs_dirent_lock);
+		config_item_put(parent_item);
 		return -EINTR;
 	}
 	frag->frag_dead = true;
diff --git a/fs/coredump.c b/fs/coredump.c
index f8296a82d01d..478a0d810136 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -211,6 +211,8 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
 				return -ENOMEM;
 			(*argv)[(*argc)++] = 0;
 			++pat_ptr;
+			if (!(*pat_ptr))
+				return -ENOMEM;
 		}
 
 		/* Repeat as long as we have more pattern to process and more
 		 * output space */
@@ -786,6 +788,14 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 	if (displaced)
 		put_files_struct(displaced);
 	if (!dump_interrupted()) {
+		/*
+		 * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
+		 * have this set to NULL.
+		 */
+		if (!cprm.file) {
+			pr_info("Core dump to |%s disabled\n", cn.corename);
+			goto close_fail;
+		}
 		file_start_write(cprm.file);
 		core_dumped = binfmt->core_dump(&cprm);
 		file_end_write(cprm.file);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 1ecaac7ee3cb..ed015cb66c7c 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -54,6 +54,7 @@ struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags)
 
 /**
  * fscrypt_free_bounce_page() - free a ciphertext bounce page
+ * @bounce_page: the bounce page to free, or NULL
  *
  * Free a bounce page that was allocated by fscrypt_encrypt_pagecache_blocks(),
  * or by fscrypt_alloc_bounce_page() directly.
@@ -76,8 +77,12 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num,
 
 	memset(iv, 0, ci->ci_mode->ivsize);
 
 	if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) {
-		WARN_ON_ONCE((u32)lblk_num != lblk_num);
+		WARN_ON_ONCE(lblk_num > U32_MAX);
+		WARN_ON_ONCE(ci->ci_inode->i_ino > U32_MAX);
 		lblk_num |= (u64)ci->ci_inode->i_ino << 32;
+	} else if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) {
+		WARN_ON_ONCE(lblk_num > U32_MAX);
+		lblk_num = (u32)(ci->ci_hashed_ino + lblk_num);
 	} else if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
 		memcpy(iv->nonce, ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE);
 	}
@@ -132,7 +137,8 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw,
 }
 
 /**
- * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a pagecache page
+ * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a
+ *					pagecache page
  * @page:      The locked pagecache page containing the block(s) to encrypt
  * @len:       Total size of the block(s) to encrypt.  Must be a nonzero
  *	       multiple of the filesystem's block size.
@@ -222,7 +228,8 @@ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
 EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
 
 /**
- * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a pagecache page
+ * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a
+ *					pagecache page
  * @page:      The locked pagecache page containing the block(s) to decrypt
  * @len:       Total size of the block(s) to decrypt.  Must be a nonzero
  *	       multiple of the filesystem's block size.
@@ -346,6 +353,8 @@ void fscrypt_msg(const struct inode *inode, const char *level,
 
 /**
  * fscrypt_init() - Set up for fs encryption.
+ *
+ * Return: 0 on success; -errno on failure
  */
 static int __init fscrypt_init(void)
 {
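Putting the new IV_INO_LBLK_32 branch above together with the keysetup.c change later in this series: the inode number is hashed once with SipHash at key-setup time, and each block's 32-bit IV is that hash plus the logical block number, wrapping modulo 2^32. A small sketch, with the helper name invented:

#include <linux/siphash.h>

/* Invented helper mirroring the two hunks: ci_hashed_ino is computed once
 * per inode (see the keysetup.c hunk below), then per-block IVs are formed
 * by 32-bit addition, which wraps modulo 2^32 by design. */
static u32 iv_ino_lblk_32(u64 ino, u32 lblk, const siphash_key_t *ino_hash_key)
{
	u32 hashed_ino = (u32)siphash_1u64(ino, ino_hash_key);

	return hashed_ino + lblk;
}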
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 4c212442a8f7..83ca5f1e7934 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -18,7 +18,7 @@
 #include <crypto/skcipher.h>
 #include "fscrypt_private.h"
 
-/**
+/*
  * struct fscrypt_nokey_name - identifier for directory entry when key is absent
  *
  * When userspace lists an encrypted directory without access to the key, the
@@ -83,13 +83,8 @@ static int fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result)
 			tfm = prev_tfm;
 		}
 	}
-	{
-		SHASH_DESC_ON_STACK(desc, tfm);
-
-		desc->tfm = tfm;
-
-		return crypto_shash_digest(desc, data, data_len, result);
-	}
+
+	return crypto_shash_tfm_digest(tfm, data, data_len, result);
 }
 
 static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
@@ -105,9 +100,12 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
 
 /**
  * fscrypt_fname_encrypt() - encrypt a filename
- *
- * The output buffer must be at least as large as the input buffer.
- * Any extra space is filled with NUL padding before encryption.
+ * @inode: inode of the parent directory (for regular filenames)
+ *	   or of the symlink (for symlink targets)
+ * @iname: the filename to encrypt
+ * @out: (output) the encrypted filename
+ * @olen: size of the encrypted filename.  It must be at least @iname->len.
+ *	  Any extra space is filled with NUL padding before encryption.
  *
  * Return: 0 on success, -errno on failure
  */
@@ -157,8 +155,11 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
 
 /**
  * fname_decrypt() - decrypt a filename
- *
- * The caller must have allocated sufficient memory for the @oname string.
+ * @inode: inode of the parent directory (for regular filenames)
+ *	   or of the symlink (for symlink targets)
+ * @iname: the encrypted filename to decrypt
+ * @oname: (output) the decrypted filename.  The caller must have allocated
+ *	   enough space for this, e.g. using fscrypt_fname_alloc_buffer().
 *
 * Return: 0 on success, -errno on failure
 */
@@ -206,7 +207,10 @@ static const char lookup_table[65] =
 #define BASE64_CHARS(nbytes)	DIV_ROUND_UP((nbytes) * 4, 3)
 
 /**
- * base64_encode() -
+ * base64_encode() - base64-encode some bytes
+ * @src: the bytes to encode
+ * @len: number of bytes to encode
+ * @dst: (output) the base64-encoded string.  Not NUL-terminated.
 *
 * Encodes the input string using characters from the set [A-Za-z0-9+,].
 * The encoded string is roughly 4/3 times the size of the input string.
@@ -272,7 +276,12 @@ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
 }
 
 /**
- * fscrypt_fname_alloc_buffer - allocate a buffer for presented filenames
+ * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames
+ * @inode: inode of the parent directory (for regular filenames)
+ *	   or of the symlink (for symlink targets)
+ * @max_encrypted_len: maximum length of encrypted filenames the buffer will be
+ *		       used to present
+ * @crypto_str: (output) buffer to allocate
 *
 * Allocate a buffer that is large enough to hold any decrypted or encoded
 * filename (null-terminated), for the given maximum encrypted filename length.
@@ -297,9 +306,10 @@ int fscrypt_fname_alloc_buffer(const struct inode *inode,
 EXPORT_SYMBOL(fscrypt_fname_alloc_buffer);
 
 /**
- * fscrypt_fname_free_buffer - free the buffer for presented filenames
+ * fscrypt_fname_free_buffer() - free a buffer for presented filenames
+ * @crypto_str: the buffer to free
 *
- * Free the buffer allocated by fscrypt_fname_alloc_buffer().
+ * Free a buffer that was allocated by fscrypt_fname_alloc_buffer().
 */
 void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
 {
@@ -311,10 +321,19 @@ void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
 EXPORT_SYMBOL(fscrypt_fname_free_buffer);
 
 /**
- * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user
- *				 space
- *
- * The caller must have allocated sufficient memory for the @oname string.
+ * fscrypt_fname_disk_to_usr() - convert an encrypted filename to
+ *				 user-presentable form
+ * @inode: inode of the parent directory (for regular filenames)
+ *	   or of the symlink (for symlink targets)
+ * @hash: first part of the name's dirhash, if applicable.  This only needs to
+ *	  be provided if the filename is located in an indexed directory whose
+ *	  encryption key may be unavailable.  Not needed for symlink targets.
+ * @minor_hash: second part of the name's dirhash, if applicable
+ * @iname: encrypted filename to convert.  May also be "." or "..", which
+ *	   aren't actually encrypted.
+ * @oname: output buffer for the user-presentable filename.  The caller must
+ *	   have allocated enough space for this, e.g. using
+ *	   fscrypt_fname_alloc_buffer().
 *
 * If the key is available, we'll decrypt the disk name.  Otherwise, we'll
 * encode it for presentation in fscrypt_nokey_name format.
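For the BASE64_CHARS() macro documented above, each base64 character carries 6 bits, so n bytes (8n bits) need ceil(8n/6) = ceil(4n/3) characters. A quick check with the macro itself:

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define BASE64_CHARS(nbytes)	DIV_ROUND_UP((nbytes) * 4, 3)

/* BASE64_CHARS(16) == 22: 16 bytes are 128 bits, and 128/6 rounds up to 22.
 * BASE64_CHARS(32) == 43: 256 bits need ceil(256/6) = 43 characters. */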
These values are used as @@ -293,12 +293,14 @@ extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, #define HKDF_CONTEXT_DIRECT_KEY 3 #define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 #define HKDF_CONTEXT_DIRHASH_KEY 5 +#define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6 +#define HKDF_CONTEXT_INODE_HASH_KEY 7 -extern int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, - const u8 *info, unsigned int infolen, - u8 *okm, unsigned int okmlen); +int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, + const u8 *info, unsigned int infolen, + u8 *okm, unsigned int okmlen); -extern void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); +void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); /* keyring.c */ @@ -389,14 +391,17 @@ struct fscrypt_master_key { struct list_head mk_decrypted_inodes; spinlock_t mk_decrypted_inodes_lock; - /* Crypto API transforms for DIRECT_KEY policies, allocated on-demand */ - struct crypto_skcipher *mk_direct_tfms[__FSCRYPT_MODE_MAX + 1]; - /* - * Crypto API transforms for filesystem-layer implementation of - * IV_INO_LBLK_64 policies, allocated on-demand. + * Per-mode encryption keys for the various types of encryption policies + * that use them. Allocated and derived on-demand. */ - struct crypto_skcipher *mk_iv_ino_lblk_64_tfms[__FSCRYPT_MODE_MAX + 1]; + struct crypto_skcipher *mk_direct_keys[__FSCRYPT_MODE_MAX + 1]; + struct crypto_skcipher *mk_iv_ino_lblk_64_keys[__FSCRYPT_MODE_MAX + 1]; + struct crypto_skcipher *mk_iv_ino_lblk_32_keys[__FSCRYPT_MODE_MAX + 1]; + + /* Hash key for inode numbers. Initialized only when needed. */ + siphash_key_t mk_ino_hash_key; + bool mk_ino_hash_key_initialized; } __randomize_layout; @@ -436,14 +441,17 @@ static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) return 0; } -extern struct key * +struct key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); -extern int fscrypt_verify_key_added(struct super_block *sb, - const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); +int fscrypt_add_test_dummy_key(struct super_block *sb, + struct fscrypt_key_specifier *key_spec); + +int fscrypt_verify_key_added(struct super_block *sb, + const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); -extern int __init fscrypt_init_keyring(void); +int __init fscrypt_init_keyring(void); /* keysetup.c */ @@ -457,33 +465,32 @@ struct fscrypt_mode { extern struct fscrypt_mode fscrypt_modes[]; -extern struct crypto_skcipher * -fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, - const struct inode *inode); +struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode, + const u8 *raw_key, + const struct inode *inode); -extern int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, - const u8 *raw_key); +int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key); -extern int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, - const struct fscrypt_master_key *mk); +int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, + const struct fscrypt_master_key *mk); /* keysetup_v1.c */ -extern void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); +void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); + +int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, + const u8 *raw_master_key); -extern int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, - const u8 *raw_master_key); +int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci); -extern int 
fscrypt_setup_v1_file_key_via_subscribed_keyrings( - struct fscrypt_info *ci); /* policy.c */ -extern bool fscrypt_policies_equal(const union fscrypt_policy *policy1, - const union fscrypt_policy *policy2); -extern bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, - const struct inode *inode); -extern int fscrypt_policy_from_context(union fscrypt_policy *policy_u, - const union fscrypt_context *ctx_u, - int ctx_size); +bool fscrypt_policies_equal(const union fscrypt_policy *policy1, + const union fscrypt_policy *policy2); +bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, + const struct inode *inode); +int fscrypt_policy_from_context(union fscrypt_policy *policy_u, + const union fscrypt_context *ctx_u, + int ctx_size); #endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index efb95bd19a89..0cba7928446d 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -44,17 +44,13 @@ static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, unsigned int ikmlen, u8 prk[HKDF_HASHLEN]) { static const u8 default_salt[HKDF_HASHLEN]; - SHASH_DESC_ON_STACK(desc, hmac_tfm); int err; err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN); if (err) return err; - desc->tfm = hmac_tfm; - err = crypto_shash_digest(desc, ikm, ikmlen, prk); - shash_desc_zero(desc); - return err; + return crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk); } /* diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 5ef861742921..09fb8aa0f2e9 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -10,7 +10,7 @@ #include "fscrypt_private.h" /** - * fscrypt_file_open - prepare to open a possibly-encrypted regular file + * fscrypt_file_open() - prepare to open a possibly-encrypted regular file * @inode: the inode being opened * @filp: the struct file being set up * @@ -262,7 +262,7 @@ err_free_sd: EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink); /** - * fscrypt_get_symlink - get the target of an encrypted symlink + * fscrypt_get_symlink() - get the target of an encrypted symlink * @inode: the symlink inode * @caddr: the on-disk contents of the symlink * @max_size: size of @caddr buffer diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index ab41b25d4fa1..e24eb48bfbe1 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -20,6 +20,7 @@ #include <crypto/skcipher.h> #include <linux/key-type.h> +#include <linux/random.h> #include <linux/seq_file.h> #include "fscrypt_private.h" @@ -44,8 +45,9 @@ static void free_master_key(struct fscrypt_master_key *mk) wipe_master_key_secret(&mk->mk_secret); for (i = 0; i <= __FSCRYPT_MODE_MAX; i++) { - crypto_free_skcipher(mk->mk_direct_tfms[i]); - crypto_free_skcipher(mk->mk_iv_ino_lblk_64_tfms[i]); + crypto_free_skcipher(mk->mk_direct_keys[i]); + crypto_free_skcipher(mk->mk_iv_ino_lblk_64_keys[i]); + crypto_free_skcipher(mk->mk_iv_ino_lblk_32_keys[i]); } key_put(mk->mk_users); @@ -424,9 +426,9 @@ static int add_existing_master_key(struct fscrypt_master_key *mk, return 0; } -static int add_master_key(struct super_block *sb, - struct fscrypt_master_key_secret *secret, - const struct fscrypt_key_specifier *mk_spec) +static int do_add_master_key(struct super_block *sb, + struct fscrypt_master_key_secret *secret, + const struct fscrypt_key_specifier *mk_spec) { static DEFINE_MUTEX(fscrypt_add_key_mutex); struct key *key; @@ -465,6 +467,35 @@ out_unlock: return err; } +static int add_master_key(struct super_block *sb, + struct fscrypt_master_key_secret *secret, + struct fscrypt_key_specifier *key_spec) +{ + 
int err; + + if (key_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { + err = fscrypt_init_hkdf(&secret->hkdf, secret->raw, + secret->size); + if (err) + return err; + + /* + * Now that the HKDF context is initialized, the raw key is no + * longer needed. + */ + memzero_explicit(secret->raw, secret->size); + + /* Calculate the key identifier */ + err = fscrypt_hkdf_expand(&secret->hkdf, + HKDF_CONTEXT_KEY_IDENTIFIER, NULL, 0, + key_spec->u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); + if (err) + return err; + } + return do_add_master_key(sb, secret, key_spec); +} + static int fscrypt_provisioning_key_preparse(struct key_preparsed_payload *prep) { const struct fscrypt_provisioning_key_payload *payload = prep->data; @@ -609,6 +640,15 @@ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg) if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) return -EINVAL; + /* + * Only root can add keys that are identified by an arbitrary descriptor + * rather than by a cryptographic hash --- since otherwise a malicious + * user could add the wrong key. + */ + if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && + !capable(CAP_SYS_ADMIN)) + return -EACCES; + memset(&secret, 0, sizeof(secret)); if (arg.key_id) { if (arg.raw_size != 0) @@ -626,48 +666,17 @@ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg) goto out_wipe_secret; } - switch (arg.key_spec.type) { - case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: - /* - * Only root can add keys that are identified by an arbitrary - * descriptor rather than by a cryptographic hash --- since - * otherwise a malicious user could add the wrong key. - */ - err = -EACCES; - if (!capable(CAP_SYS_ADMIN)) - goto out_wipe_secret; - break; - case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: - err = fscrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size); - if (err) - goto out_wipe_secret; - - /* - * Now that the HKDF context is initialized, the raw key is no - * longer needed. - */ - memzero_explicit(secret.raw, secret.size); - - /* Calculate the key identifier and return it to userspace. */ - err = fscrypt_hkdf_expand(&secret.hkdf, - HKDF_CONTEXT_KEY_IDENTIFIER, - NULL, 0, arg.key_spec.u.identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE); - if (err) - goto out_wipe_secret; - err = -EFAULT; - if (copy_to_user(uarg->key_spec.u.identifier, - arg.key_spec.u.identifier, - FSCRYPT_KEY_IDENTIFIER_SIZE)) - goto out_wipe_secret; - break; - default: - WARN_ON(1); - err = -EINVAL; + err = add_master_key(sb, &secret, &arg.key_spec); + if (err) goto out_wipe_secret; - } - err = add_master_key(sb, &secret, &arg.key_spec); + /* Return the key identifier to userspace, if applicable */ + err = -EFAULT; + if (arg.key_spec.type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER && + copy_to_user(uarg->key_spec.u.identifier, arg.key_spec.u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE)) + goto out_wipe_secret; + err = 0; out_wipe_secret: wipe_master_key_secret(&secret); return err; @@ -675,6 +684,29 @@ out_wipe_secret: EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key); /* + * Add the key for '-o test_dummy_encryption' to the filesystem keyring. + * + * Use a per-boot random key to prevent people from misusing this option. 
+ */ +int fscrypt_add_test_dummy_key(struct super_block *sb, + struct fscrypt_key_specifier *key_spec) +{ + static u8 test_key[FSCRYPT_MAX_KEY_SIZE]; + struct fscrypt_master_key_secret secret; + int err; + + get_random_once(test_key, FSCRYPT_MAX_KEY_SIZE); + + memset(&secret, 0, sizeof(secret)); + secret.size = FSCRYPT_MAX_KEY_SIZE; + memcpy(secret.raw, test_key, FSCRYPT_MAX_KEY_SIZE); + + err = add_master_key(sb, &secret, key_spec); + wipe_master_key_secret(&secret); + return err; +} + +/* * Verify that the current user has added a master key with the given identifier * (returns -ENOKEY if not). This is needed to prevent a user from encrypting * their files using some other user's key which they don't actually know. diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 302375e9f719..1129adfa097d 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -46,6 +46,8 @@ struct fscrypt_mode fscrypt_modes[] = { }, }; +static DEFINE_MUTEX(fscrypt_mode_key_setup_mutex); + static struct fscrypt_mode * select_encryption_mode(const union fscrypt_policy *policy, const struct inode *inode) @@ -130,7 +132,7 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, const struct super_block *sb = inode->i_sb; struct fscrypt_mode *mode = ci->ci_mode; const u8 mode_num = mode - fscrypt_modes; - struct crypto_skcipher *tfm, *prev_tfm; + struct crypto_skcipher *tfm; u8 mode_key[FSCRYPT_MAX_KEY_SIZE]; u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)]; unsigned int hkdf_infolen = 0; @@ -139,10 +141,17 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, if (WARN_ON(mode_num > __FSCRYPT_MODE_MAX)) return -EINVAL; - /* pairs with cmpxchg() below */ + /* pairs with smp_store_release() below */ tfm = READ_ONCE(tfms[mode_num]); - if (likely(tfm != NULL)) - goto done; + if (likely(tfm != NULL)) { + ci->ci_ctfm = tfm; + return 0; + } + + mutex_lock(&fscrypt_mode_key_setup_mutex); + + if (tfms[mode_num]) + goto done_unlock; BUILD_BUG_ON(sizeof(mode_num) != 1); BUILD_BUG_ON(sizeof(sb->s_uuid) != 16); @@ -157,21 +166,21 @@ static int setup_per_mode_enc_key(struct fscrypt_info *ci, hkdf_context, hkdf_info, hkdf_infolen, mode_key, mode->keysize); if (err) - return err; + goto out_unlock; tfm = fscrypt_allocate_skcipher(mode, mode_key, inode); memzero_explicit(mode_key, mode->keysize); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - /* pairs with READ_ONCE() above */ - prev_tfm = cmpxchg(&tfms[mode_num], NULL, tfm); - if (prev_tfm != NULL) { - crypto_free_skcipher(tfm); - tfm = prev_tfm; + if (IS_ERR(tfm)) { + err = PTR_ERR(tfm); + goto out_unlock; } -done: + /* pairs with READ_ONCE() above */ + smp_store_release(&tfms[mode_num], tfm); +done_unlock: ci->ci_ctfm = tfm; - return 0; + err = 0; +out_unlock: + mutex_unlock(&fscrypt_mode_key_setup_mutex); + return err; } int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, @@ -189,6 +198,43 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, return 0; } +static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci, + struct fscrypt_master_key *mk) +{ + int err; + + err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_32_keys, + HKDF_CONTEXT_IV_INO_LBLK_32_KEY, true); + if (err) + return err; + + /* pairs with smp_store_release() below */ + if (!smp_load_acquire(&mk->mk_ino_hash_key_initialized)) { + + mutex_lock(&fscrypt_mode_key_setup_mutex); + + if (mk->mk_ino_hash_key_initialized) + goto unlock; + + err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, + HKDF_CONTEXT_INODE_HASH_KEY, NULL, 0, + (u8 *)&mk->mk_ino_hash_key, + 
sizeof(mk->mk_ino_hash_key)); + if (err) + goto unlock; + /* pairs with smp_load_acquire() above */ + smp_store_release(&mk->mk_ino_hash_key_initialized, true); +unlock: + mutex_unlock(&fscrypt_mode_key_setup_mutex); + if (err) + return err; + } + + ci->ci_hashed_ino = (u32)siphash_1u64(ci->ci_inode->i_ino, + &mk->mk_ino_hash_key); + return 0; +} + static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, struct fscrypt_master_key *mk) { @@ -203,7 +249,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, * encryption key. This ensures that the master key is * consistently used only for HKDF, avoiding key reuse issues. */ - err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_tfms, + err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_keys, HKDF_CONTEXT_DIRECT_KEY, false); } else if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { @@ -211,11 +257,14 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, * IV_INO_LBLK_64: encryption keys are derived from (master_key, * mode_num, filesystem_uuid), and inode number is included in * the IVs. This format is optimized for use with inline - * encryption hardware compliant with the UFS or eMMC standards. + * encryption hardware compliant with the UFS standard. */ - err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_tfms, + err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_keys, HKDF_CONTEXT_IV_INO_LBLK_64_KEY, true); + } else if (ci->ci_policy.v2.flags & + FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) { + err = fscrypt_setup_iv_ino_lblk_32_key(ci, mk); } else { u8 derived_key[FSCRYPT_MAX_KEY_SIZE]; @@ -395,21 +444,18 @@ int fscrypt_get_encryption_info(struct inode *inode) res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { - if (!fscrypt_dummy_context_enabled(inode) || - IS_ENCRYPTED(inode)) { + const union fscrypt_context *dummy_ctx = + fscrypt_get_dummy_context(inode->i_sb); + + if (IS_ENCRYPTED(inode) || !dummy_ctx) { fscrypt_warn(inode, "Error %d getting encryption context", res); return res; } /* Fake up a context for an unencrypted directory */ - memset(&ctx, 0, sizeof(ctx)); - ctx.version = FSCRYPT_CONTEXT_V1; - ctx.v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; - ctx.v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; - memset(ctx.v1.master_key_descriptor, 0x42, - FSCRYPT_KEY_DESCRIPTOR_SIZE); - res = sizeof(ctx.v1); + res = fscrypt_context_size(dummy_ctx); + memcpy(&ctx, dummy_ctx, res); } crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_NOFS); @@ -475,7 +521,8 @@ out: EXPORT_SYMBOL(fscrypt_get_encryption_info); /** - * fscrypt_put_encryption_info - free most of an inode's fscrypt data + * fscrypt_put_encryption_info() - free most of an inode's fscrypt data + * @inode: an inode being evicted * * Free the inode's fscrypt_info. Filesystems must call this when the inode is * being evicted. An RCU grace period need not have elapsed yet. @@ -488,7 +535,8 @@ void fscrypt_put_encryption_info(struct inode *inode) EXPORT_SYMBOL(fscrypt_put_encryption_info); /** - * fscrypt_free_inode - free an inode's fscrypt data requiring RCU delay + * fscrypt_free_inode() - free an inode's fscrypt data requiring RCU delay + * @inode: an inode being freed * * Free the inode's cached decrypted symlink target, if any. Filesystems must * call this after an RCU grace period, just before they free the inode. 
@@ -503,7 +551,8 @@ void fscrypt_free_inode(struct inode *inode) EXPORT_SYMBOL(fscrypt_free_inode); /** - * fscrypt_drop_inode - check whether the inode's master key has been removed + * fscrypt_drop_inode() - check whether the inode's master key has been removed + * @inode: an inode being considered for eviction * * Filesystems supporting fscrypt must call this from their ->drop_inode() * method so that encrypted inodes are evicted as soon as they're no longer in diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 10ccf945020c..d23ff162c78b 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -11,12 +11,15 @@ */ #include <linux/random.h> +#include <linux/seq_file.h> #include <linux/string.h> #include <linux/mount.h> #include "fscrypt_private.h" /** - * fscrypt_policies_equal - check whether two encryption policies are the same + * fscrypt_policies_equal() - check whether two encryption policies are the same + * @policy1: the first policy + * @policy2: the second policy * * Return: %true if equal, else %false */ @@ -66,18 +69,14 @@ static bool supported_direct_key_modes(const struct inode *inode, return true; } -static bool supported_iv_ino_lblk_64_policy( - const struct fscrypt_policy_v2 *policy, - const struct inode *inode) +static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy, + const struct inode *inode, + const char *type, + int max_ino_bits, int max_lblk_bits) { struct super_block *sb = inode->i_sb; int ino_bits = 64, lblk_bits = 64; - if (policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { - fscrypt_warn(inode, - "The DIRECT_KEY and IV_INO_LBLK_64 flags are mutually exclusive"); - return false; - } /* * It's unsafe to include inode numbers in the IVs if the filesystem can * potentially renumber inodes, e.g. via filesystem shrinking. 
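
supported_iv_ino_lblk_64_policy() becomes the shared supported_iv_ino_lblk_policy() so the new IV_INO_LBLK_32 flag can reuse it with max_ino_bits = INT_MAX: that variant hashes the inode number instead of embedding it, so only the block numbers must fit in 32 bits. As I read the design, the 32-bit IV is the SipHash of the inode number plus the logical block number; a sketch, not the kernel's exact code:

#include <linux/siphash.h>

/*
 * Sketch: forming an IV_INO_LBLK_32 IV.  The SipHash key corresponds to
 * the mk_ino_hash_key derived in the keysetup.c hunks above; the sum
 * wraps mod 2^32, so it always fits hardware with a 32-bit DUN.
 */
static u32 iv_ino_lblk_32(u64 ino, u32 lblk,
			  const siphash_key_t *ino_hash_key)
{
	u32 hashed_ino = (u32)siphash_1u64(ino, ino_hash_key);

	return hashed_ino + lblk;
}
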
@@ -85,16 +84,22 @@ static bool supported_iv_ino_lblk_64_policy( if (!sb->s_cop->has_stable_inodes || !sb->s_cop->has_stable_inodes(sb)) { fscrypt_warn(inode, - "Can't use IV_INO_LBLK_64 policy on filesystem '%s' because it doesn't have stable inode numbers", - sb->s_id); + "Can't use %s policy on filesystem '%s' because it doesn't have stable inode numbers", + type, sb->s_id); return false; } if (sb->s_cop->get_ino_and_lblk_bits) sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits); - if (ino_bits > 32 || lblk_bits > 32) { + if (ino_bits > max_ino_bits) { + fscrypt_warn(inode, + "Can't use %s policy on filesystem '%s' because its inode numbers are too long", + type, sb->s_id); + return false; + } + if (lblk_bits > max_lblk_bits) { fscrypt_warn(inode, - "Can't use IV_INO_LBLK_64 policy on filesystem '%s' because it doesn't use 32-bit inode and block numbers", - sb->s_id); + "Can't use %s policy on filesystem '%s' because its block numbers are too long", + type, sb->s_id); return false; } return true; @@ -137,6 +142,8 @@ static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy, static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, const struct inode *inode) { + int count = 0; + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, policy->filenames_encryption_mode)) { fscrypt_warn(inode, @@ -152,13 +159,29 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, return false; } + count += !!(policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY); + count += !!(policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64); + count += !!(policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32); + if (count > 1) { + fscrypt_warn(inode, "Mutually exclusive encryption flags (0x%02x)", + policy->flags); + return false; + } + if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) && !supported_direct_key_modes(inode, policy->contents_encryption_mode, policy->filenames_encryption_mode)) return false; if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) && - !supported_iv_ino_lblk_64_policy(policy, inode)) + !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_64", + 32, 32)) + return false; + + if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) && + /* This uses hashed inode numbers, so ino_bits doesn't matter. */ + !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32", + INT_MAX, 32)) return false; if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) { @@ -170,7 +193,9 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, } /** - * fscrypt_supported_policy - check whether an encryption policy is supported + * fscrypt_supported_policy() - check whether an encryption policy is supported + * @policy_u: the encryption policy + * @inode: the inode on which the policy will be used * * Given an encryption policy, check whether all its encryption modes and other * settings are supported by this kernel on the given inode. (But we don't @@ -192,7 +217,10 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, } /** - * fscrypt_new_context_from_policy - create a new fscrypt_context from a policy + * fscrypt_new_context_from_policy() - create a new fscrypt_context from + * an fscrypt_policy + * @ctx_u: output context + * @policy_u: input policy * * Create an fscrypt_context for an inode that is being assigned the given * encryption policy. A new nonce is randomly generated. 
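
The v2 policy check above counts the DIRECT_KEY, IV_INO_LBLK_64, and IV_INO_LBLK_32 bits with three !! additions and rejects any combination of them. An equivalent formulation, shown only as a sketch and not what the patch uses, is a population count over the masked flags:

#include <linux/bitops.h>
#include <linux/fscrypt.h>

/* Sketch: reject policies setting more than one of the exclusive flags. */
static bool too_many_iv_flags(u8 flags)
{
	const u8 exclusive = FSCRYPT_POLICY_FLAG_DIRECT_KEY |
			     FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 |
			     FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32;

	return hweight8(flags & exclusive) > 1;
}
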
@@ -242,7 +270,11 @@ static int fscrypt_new_context_from_policy(union fscrypt_context *ctx_u, } /** - * fscrypt_policy_from_context - convert an fscrypt_context to an fscrypt_policy + * fscrypt_policy_from_context() - convert an fscrypt_context to + * an fscrypt_policy + * @policy_u: output policy + * @ctx_u: input context + * @ctx_size: size of input context in bytes * * Given an fscrypt_context, build the corresponding fscrypt_policy. * @@ -354,6 +386,9 @@ static int set_encryption_policy(struct inode *inode, policy->v2.master_key_identifier); if (err) return err; + if (policy->v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) + pr_warn_once("%s (pid %d) is setting an IV_INO_LBLK_32 encryption policy. This should only be used if there are certain hardware limitations.\n", + current->comm, current->pid); break; default: WARN_ON(1); @@ -605,3 +640,127 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, return preload ? fscrypt_get_encryption_info(child): 0; } EXPORT_SYMBOL(fscrypt_inherit_context); + +/** + * fscrypt_set_test_dummy_encryption() - handle '-o test_dummy_encryption' + * @sb: the filesystem on which test_dummy_encryption is being specified + * @arg: the argument to the test_dummy_encryption option. + * If no argument was specified, then @arg->from == NULL. + * @dummy_ctx: the filesystem's current dummy context (input/output, see below) + * + * Handle the test_dummy_encryption mount option by creating a dummy encryption + * context, saving it in @dummy_ctx, and adding the corresponding dummy + * encryption key to the filesystem. If the @dummy_ctx is already set, then + * instead validate that it matches @arg. Don't support changing it via + * remount, as that is difficult to do safely. + * + * The reason we use an fscrypt_context rather than an fscrypt_policy is because + * we mustn't generate a new nonce each time we access a dummy-encrypted + * directory, as that would change the way filenames are encrypted. + * + * Return: 0 on success (dummy context set, or the same context is already set); + * -EEXIST if a different dummy context is already set; + * or another -errno value. + */ +int fscrypt_set_test_dummy_encryption(struct super_block *sb, + const substring_t *arg, + struct fscrypt_dummy_context *dummy_ctx) +{ + const char *argstr = "v2"; + const char *argstr_to_free = NULL; + struct fscrypt_key_specifier key_spec = { 0 }; + int version; + union fscrypt_context *ctx = NULL; + int err; + + if (arg->from) { + argstr = argstr_to_free = match_strdup(arg); + if (!argstr) + return -ENOMEM; + } + + if (!strcmp(argstr, "v1")) { + version = FSCRYPT_CONTEXT_V1; + key_spec.type = FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR; + memset(key_spec.u.descriptor, 0x42, + FSCRYPT_KEY_DESCRIPTOR_SIZE); + } else if (!strcmp(argstr, "v2")) { + version = FSCRYPT_CONTEXT_V2; + key_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER; + /* key_spec.u.identifier gets filled in when adding the key */ + } else { + err = -EINVAL; + goto out; + } + + if (dummy_ctx->ctx) { + /* + * Note: if we ever make test_dummy_encryption support + * specifying other encryption settings, such as the encryption + * modes, we'll need to compare those settings here. 
+ */ + if (dummy_ctx->ctx->version == version) + err = 0; + else + err = -EEXIST; + goto out; + } + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) { + err = -ENOMEM; + goto out; + } + + err = fscrypt_add_test_dummy_key(sb, &key_spec); + if (err) + goto out; + + ctx->version = version; + switch (ctx->version) { + case FSCRYPT_CONTEXT_V1: + ctx->v1.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; + ctx->v1.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; + memcpy(ctx->v1.master_key_descriptor, key_spec.u.descriptor, + FSCRYPT_KEY_DESCRIPTOR_SIZE); + break; + case FSCRYPT_CONTEXT_V2: + ctx->v2.contents_encryption_mode = FSCRYPT_MODE_AES_256_XTS; + ctx->v2.filenames_encryption_mode = FSCRYPT_MODE_AES_256_CTS; + memcpy(ctx->v2.master_key_identifier, key_spec.u.identifier, + FSCRYPT_KEY_IDENTIFIER_SIZE); + break; + default: + WARN_ON(1); + err = -EINVAL; + goto out; + } + dummy_ctx->ctx = ctx; + ctx = NULL; + err = 0; +out: + kfree(ctx); + kfree(argstr_to_free); + return err; +} +EXPORT_SYMBOL_GPL(fscrypt_set_test_dummy_encryption); + +/** + * fscrypt_show_test_dummy_encryption() - show '-o test_dummy_encryption' + * @seq: the seq_file to print the option to + * @sep: the separator character to use + * @sb: the filesystem whose options are being shown + * + * Show the test_dummy_encryption mount option, if it was specified. + * This is mainly used for /proc/mounts. + */ +void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep, + struct super_block *sb) +{ + const union fscrypt_context *ctx = fscrypt_get_dummy_context(sb); + + if (!ctx) + return; + seq_printf(seq, "%ctest_dummy_encryption=v%d", sep, ctx->version); +} +EXPORT_SYMBOL_GPL(fscrypt_show_test_dummy_encryption); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 2d357680094c..ae49a55bda00 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -506,20 +506,11 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); * This function creates a file in debugfs with the given name that * contains the value of the variable @value. If the @mode variable is so * set, it can be read from, and written to. - * - * This function will return a pointer to a dentry if it succeeds. This - * pointer must be passed to the debugfs_remove() function when the file is - * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be - * returned. - * - * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will - * be returned. 
*/ -struct dentry *debugfs_create_u32(const char *name, umode_t mode, - struct dentry *parent, u32 *value) +void debugfs_create_u32(const char *name, umode_t mode, struct dentry *parent, + u32 *value) { - return debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u32, + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_u32, &fops_u32_ro, &fops_u32_wo); } EXPORT_SYMBOL_GPL(debugfs_create_u32); diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 2c449aed1b92..0681540c48d9 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -48,18 +48,6 @@ void ecryptfs_from_hex(char *dst, char *src, int dst_size) } } -static int ecryptfs_hash_digest(struct crypto_shash *tfm, - char *src, int len, char *dst) -{ - SHASH_DESC_ON_STACK(desc, tfm); - int err; - - desc->tfm = tfm; - err = crypto_shash_digest(desc, src, len, dst); - shash_desc_zero(desc); - return err; -} - /** * ecryptfs_calculate_md5 - calculates the md5 of @src * @dst: Pointer to 16 bytes of allocated memory @@ -74,11 +62,8 @@ static int ecryptfs_calculate_md5(char *dst, struct ecryptfs_crypt_stat *crypt_stat, char *src, int len) { - struct crypto_shash *tfm; - int rc = 0; + int rc = crypto_shash_tfm_digest(crypt_stat->hash_tfm, src, len, dst); - tfm = crypt_stat->hash_tfm; - rc = ecryptfs_hash_digest(tfm, src, len, dst); if (rc) { printk(KERN_ERR "%s: Error computing crypto hash; rc = [%d]\n", diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8c596641a72b..12eebcdea9c8 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1171,6 +1171,10 @@ static inline bool chain_epi_lockless(struct epitem *epi) { struct eventpoll *ep = epi->ep; + /* Fast preliminary check */ + if (epi->next != EP_UNACTIVE_PTR) + return false; + /* Check that the same epi has not been just chained from another CPU */ if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) return false; @@ -1237,16 +1241,12 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * chained in ep->ovflist and requeued later on. */ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { - if (epi->next == EP_UNACTIVE_PTR && - chain_epi_lockless(epi)) + if (chain_epi_lockless(epi)) + ep_pm_stay_awake_rcu(epi); + } else if (!ep_is_linked(epi)) { + /* In the usual case, add event to ready list. */ + if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) ep_pm_stay_awake_rcu(epi); - goto out_unlock; - } - - /* If this file is already in the ready list we exit soon */ - if (!ep_is_linked(epi) && - list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) { - ep_pm_stay_awake_rcu(epi); } /* @@ -1822,7 +1822,6 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, { int res = 0, eavail, timed_out = 0; u64 slack = 0; - bool waiter = false; wait_queue_entry_t wait; ktime_t expires, *to = NULL; @@ -1867,55 +1866,75 @@ fetch_events: */ ep_reset_busy_poll_napi_id(ep); - /* - * We don't have any available event to return to the caller. We need - * to sleep here, and we will be woken by ep_poll_callback() when events - * become available. - */ - if (!waiter) { - waiter = true; - init_waitqueue_entry(&wait, current); + do { + /* + * Internally init_wait() uses autoremove_wake_function(), + * thus wait entry is removed from the wait queue on each + * wakeup. Why it is important? In case of several waiters + * each new wakeup will hit the next waiter, giving it the + * chance to harvest new event. Otherwise wakeup can be + * lost. 
This is also good performance-wise, because on + * normal wakeup path no need to call __remove_wait_queue() + * explicitly, thus ep->lock is not taken, which halts the + * event delivery. + */ + init_wait(&wait); write_lock_irq(&ep->lock); - __add_wait_queue_exclusive(&ep->wq, &wait); - write_unlock_irq(&ep->lock); - } - - for (;;) { /* - * We don't want to sleep if the ep_poll_callback() sends us - * a wakeup in between. That's why we set the task state - * to TASK_INTERRUPTIBLE before doing the checks. + * Barrierless variant, waitqueue_active() is called under + * the same lock on wakeup ep_poll_callback() side, so it + * is safe to avoid an explicit barrier. */ - set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_INTERRUPTIBLE); + /* - * Always short-circuit for fatal signals to allow - * threads to make a timely exit without the chance of - * finding more events available and fetching - * repeatedly. + * Do the final check under the lock. ep_scan_ready_list() + * plays with two lists (->rdllist and ->ovflist) and there + * is always a race when both lists are empty for short + * period of time although events are pending, so lock is + * important. */ - if (fatal_signal_pending(current)) { - res = -EINTR; - break; + eavail = ep_events_available(ep); + if (!eavail) { + if (signal_pending(current)) + res = -EINTR; + else + __add_wait_queue_exclusive(&ep->wq, &wait); } + write_unlock_irq(&ep->lock); - eavail = ep_events_available(ep); - if (eavail) + if (eavail || res) break; - if (signal_pending(current)) { - res = -EINTR; - break; - } if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) { timed_out = 1; break; } - } + + /* We were woken up, thus go and try to harvest some events */ + eavail = 1; + + } while (0); __set_current_state(TASK_RUNNING); + if (!list_empty_careful(&wait.entry)) { + write_lock_irq(&ep->lock); + __remove_wait_queue(&ep->wq, &wait); + write_unlock_irq(&ep->lock); + } + send_events: + if (fatal_signal_pending(current)) { + /* + * Always short-circuit for fatal signals to allow + * threads to make a timely exit without the chance of + * finding more events available and fetching + * repeatedly. + */ + res = -EINTR; + } /* * Try to transfer events to user space. 
In case we get 0 events and * there's still timeout left over, we go trying again in search of @@ -1925,12 +1944,6 @@ send_events: !(res = ep_send_events(ep, events, maxevents)) && !timed_out) goto fetch_events; - if (waiter) { - write_lock_irq(&ep->lock); - __remove_wait_queue(&ep->wq, &wait); - write_unlock_irq(&ep->lock); - } - return res; } diff --git a/fs/exec.c b/fs/exec.c index 06b4c550af5d..2c465119affc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1317,6 +1317,8 @@ int flush_old_exec(struct linux_binprm * bprm) */ set_mm_exe_file(bprm->mm, bprm->file); + would_dump(bprm, bprm->file); + /* * Release all of the old mmap stuff */ @@ -1876,8 +1878,6 @@ static int __do_execve_file(int fd, struct filename *filename, if (retval < 0) goto out; - would_dump(bprm, bprm->file); - retval = exec_binprm(bprm); if (retval < 0) goto out; diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c index 6a04cc02565a..6774a5a6ded8 100644 --- a/fs/exfat/balloc.c +++ b/fs/exfat/balloc.c @@ -91,7 +91,6 @@ static int exfat_allocate_bitmap(struct super_block *sb, } } - sbi->pbr_bh = NULL; return 0; } @@ -137,8 +136,6 @@ void exfat_free_bitmap(struct exfat_sb_info *sbi) { int i; - brelse(sbi->pbr_bh); - for (i = 0; i < sbi->map_sectors; i++) __brelse(sbi->vol_amap[i]); diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 67d4e46fb810..d67fb8a6f770 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -507,6 +507,7 @@ void exfat_msg(struct super_block *sb, const char *lv, const char *fmt, ...) __printf(3, 4) __cold; void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, u8 tz, __le16 time, __le16 date, u8 time_ms); +void exfat_truncate_atime(struct timespec64 *ts); void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, u8 *tz, __le16 *time, __le16 *date, u8 *time_ms); unsigned short exfat_calc_chksum_2byte(void *data, int len, diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 483f683757aa..c9db8eb0cfc3 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -273,6 +273,7 @@ int exfat_getattr(const struct path *path, struct kstat *stat, struct exfat_inode_info *ei = EXFAT_I(inode); generic_fillattr(inode, stat); + exfat_truncate_atime(&stat->atime); stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; stat->btime.tv_nsec = ei->i_crtime.tv_nsec; @@ -339,6 +340,7 @@ int exfat_setattr(struct dentry *dentry, struct iattr *attr) } setattr_copy(inode, attr); + exfat_truncate_atime(&inode->i_atime); mark_inode_dirty(inode); out: @@ -346,12 +348,13 @@ out: } const struct file_operations exfat_file_operations = { - .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, - .fsync = generic_file_fsync, - .splice_read = generic_file_splice_read, + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .fsync = generic_file_fsync, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, }; const struct inode_operations exfat_file_inode_operations = { diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c index 14a3300848f6..ebd2cbe3cbc1 100644 --- a/fs/exfat/misc.c +++ b/fs/exfat/misc.c @@ -88,7 +88,8 @@ void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, if (time_ms) { ts->tv_sec += time_ms / 100; ts->tv_nsec = (time_ms % 100) * 10 * NSEC_PER_MSEC; - } + } else + ts->tv_nsec = 0; if (tz & EXFAT_TZ_VALID) /* Adjust timezone to UTC0. 
*/ @@ -124,6 +125,17 @@ void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, *tz = EXFAT_TZ_VALID; } +/* + * The timestamp for access_time has double seconds granularity. + * (There is no 10msIncrement field for access_time unlike create/modify_time) + * atime also has only a 2-second resolution. + */ +void exfat_truncate_atime(struct timespec64 *ts) +{ + ts->tv_sec = round_down(ts->tv_sec, 2); + ts->tv_nsec = 0; +} + unsigned short exfat_calc_chksum_2byte(void *data, int len, unsigned short chksum, int type) { diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index a8681d91f569..a2659a8a68a1 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -595,6 +595,7 @@ static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode_inc_iversion(inode); inode->i_mtime = inode->i_atime = inode->i_ctime = EXFAT_I(inode)->i_crtime = current_time(inode); + exfat_truncate_atime(&inode->i_atime); /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); @@ -691,6 +692,7 @@ static int exfat_find(struct inode *dir, struct qstr *qname, exfat_fs_error(sb, "non-zero size file starts with zero cluster (size : %llu, p_dir : %u, entry : 0x%08x)", i_size_read(dir), ei->dir.dir, ei->entry); + kfree(es); return -EIO; } @@ -854,6 +856,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry) inode_inc_iversion(dir); dir->i_mtime = dir->i_atime = current_time(dir); + exfat_truncate_atime(&dir->i_atime); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); else @@ -861,6 +864,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry) clear_nlink(inode); inode->i_mtime = inode->i_atime = current_time(inode); + exfat_truncate_atime(&inode->i_atime); exfat_unhash_inode(inode); exfat_d_version_set(dentry, inode_query_iversion(dir)); unlock: @@ -903,6 +907,7 @@ static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode_inc_iversion(inode); inode->i_mtime = inode->i_atime = inode->i_ctime = EXFAT_I(inode)->i_crtime = current_time(inode); + exfat_truncate_atime(&inode->i_atime); /* timestamp is already written, so mark_inode_dirty() is unneeded. 
*/ d_instantiate(dentry, inode); @@ -1019,6 +1024,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) inode_inc_iversion(dir); dir->i_mtime = dir->i_atime = current_time(dir); + exfat_truncate_atime(&dir->i_atime); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); else @@ -1027,6 +1033,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) clear_nlink(inode); inode->i_mtime = inode->i_atime = current_time(inode); + exfat_truncate_atime(&inode->i_atime); exfat_unhash_inode(inode); exfat_d_version_set(dentry, inode_query_iversion(dir)); unlock: @@ -1387,6 +1394,7 @@ static int exfat_rename(struct inode *old_dir, struct dentry *old_dentry, inode_inc_iversion(new_dir); new_dir->i_ctime = new_dir->i_mtime = new_dir->i_atime = EXFAT_I(new_dir)->i_crtime = current_time(new_dir); + exfat_truncate_atime(&new_dir->i_atime); if (IS_DIRSYNC(new_dir)) exfat_sync_inode(new_dir); else diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 16ed202ef527..a846ff555656 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -49,6 +49,7 @@ static void exfat_put_super(struct super_block *sb) sync_blockdev(sb->s_bdev); exfat_set_vol_flags(sb, VOL_CLEAN); exfat_free_bitmap(sbi); + brelse(sbi->pbr_bh); mutex_unlock(&sbi->s_lock); call_rcu(&sbi->rcu, exfat_delayed_free); @@ -100,7 +101,7 @@ static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf) int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag) { struct exfat_sb_info *sbi = EXFAT_SB(sb); - struct pbr64 *bpb; + struct pbr64 *bpb = (struct pbr64 *)sbi->pbr_bh->b_data; bool sync = 0; /* flags are not changed */ @@ -115,15 +116,6 @@ int exfat_set_vol_flags(struct super_block *sb, unsigned short new_flag) if (sb_rdonly(sb)) return 0; - if (!sbi->pbr_bh) { - sbi->pbr_bh = sb_bread(sb, 0); - if (!sbi->pbr_bh) { - exfat_msg(sb, KERN_ERR, "failed to read boot sector"); - return -ENOMEM; - } - } - - bpb = (struct pbr64 *)sbi->pbr_bh->b_data; bpb->bsx.vol_flags = cpu_to_le16(new_flag); if (new_flag == VOL_DIRTY && !buffer_dirty(sbi->pbr_bh)) @@ -159,7 +151,6 @@ static int exfat_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",iocharset=utf8"); else if (sbi->nls_io) seq_printf(m, ",iocharset=%s", sbi->nls_io->charset); - seq_printf(m, ",bps=%ld", sb->s_blocksize); if (opts->errors == EXFAT_ERRORS_CONT) seq_puts(m, ",errors=continue"); else if (opts->errors == EXFAT_ERRORS_PANIC) @@ -212,6 +203,12 @@ enum { Opt_errors, Opt_discard, Opt_time_offset, + + /* Deprecated options */ + Opt_utf8, + Opt_debug, + Opt_namecase, + Opt_codepage, }; static const struct constant_table exfat_param_enums[] = { @@ -232,6 +229,14 @@ static const struct fs_parameter_spec exfat_parameters[] = { fsparam_enum("errors", Opt_errors, exfat_param_enums), fsparam_flag("discard", Opt_discard), fsparam_s32("time_offset", Opt_time_offset), + __fsparam(NULL, "utf8", Opt_utf8, fs_param_deprecated, + NULL), + __fsparam(NULL, "debug", Opt_debug, fs_param_deprecated, + NULL), + __fsparam(fs_param_is_u32, "namecase", Opt_namecase, + fs_param_deprecated, NULL), + __fsparam(fs_param_is_u32, "codepage", Opt_codepage, + fs_param_deprecated, NULL), {} }; @@ -287,6 +292,11 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; opts->time_offset = result.int_32; break; + case Opt_utf8: + case Opt_debug: + case Opt_namecase: + case Opt_codepage: + break; default: return -EINVAL; } @@ -351,14 +361,15 @@ static int exfat_read_root(struct inode *inode) exfat_save_attr(inode, ATTR_SUBDIR); 
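
/*
 * Context for the exfat timestamp hunks: the on-disk format stores
 * create/modify times with a 10 ms sub-second field but, as the misc.c
 * comment above notes, access time with only 2-second resolution --
 * hence exfat_truncate_atime() above and the 10 * NSEC_PER_MSEC
 * s_time_gran below.  A self-contained model of the atime rounding
 * (illustrative only, not the driver's code):
 */
static inline void atime_to_exfat_resolution(long long *sec, long *nsec)
{
	*sec &= ~1LL;	/* round down to an even second */
	*nsec = 0;
}
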
inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = current_time(inode); + exfat_truncate_atime(&inode->i_atime); exfat_cache_init_inode(inode); return 0; } -static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb, - struct buffer_head **prev_bh) +static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb) { - struct pbr *p_pbr = (struct pbr *) (*prev_bh)->b_data; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct pbr *p_pbr = (struct pbr *) (sbi->pbr_bh)->b_data; unsigned short logical_sect = 0; logical_sect = 1 << p_pbr->bsx.f64.sect_size_bits; @@ -378,26 +389,23 @@ static struct pbr *exfat_read_pbr_with_logical_sector(struct super_block *sb, } if (logical_sect > sb->s_blocksize) { - struct buffer_head *bh = NULL; - - __brelse(*prev_bh); - *prev_bh = NULL; + brelse(sbi->pbr_bh); + sbi->pbr_bh = NULL; if (!sb_set_blocksize(sb, logical_sect)) { exfat_msg(sb, KERN_ERR, "unable to set blocksize %u", logical_sect); return NULL; } - bh = sb_bread(sb, 0); - if (!bh) { + sbi->pbr_bh = sb_bread(sb, 0); + if (!sbi->pbr_bh) { exfat_msg(sb, KERN_ERR, "unable to read boot sector (logical sector size = %lu)", sb->s_blocksize); return NULL; } - *prev_bh = bh; - p_pbr = (struct pbr *) bh->b_data; + p_pbr = (struct pbr *)sbi->pbr_bh->b_data; } return p_pbr; } @@ -408,21 +416,20 @@ static int __exfat_fill_super(struct super_block *sb) int ret; struct pbr *p_pbr; struct pbr64 *p_bpb; - struct buffer_head *bh; struct exfat_sb_info *sbi = EXFAT_SB(sb); /* set block size to read super block */ sb_min_blocksize(sb, 512); /* read boot sector */ - bh = sb_bread(sb, 0); - if (!bh) { + sbi->pbr_bh = sb_bread(sb, 0); + if (!sbi->pbr_bh) { exfat_msg(sb, KERN_ERR, "unable to read boot sector"); return -EIO; } /* PRB is read */ - p_pbr = (struct pbr *)bh->b_data; + p_pbr = (struct pbr *)sbi->pbr_bh->b_data; /* check the validity of PBR */ if (le16_to_cpu((p_pbr->signature)) != PBR_SIGNATURE) { @@ -433,7 +440,7 @@ static int __exfat_fill_super(struct super_block *sb) /* check logical sector size */ - p_pbr = exfat_read_pbr_with_logical_sector(sb, &bh); + p_pbr = exfat_read_pbr_with_logical_sector(sb); if (!p_pbr) { ret = -EIO; goto free_bh; @@ -514,7 +521,7 @@ free_alloc_bitmap: free_upcase_table: exfat_free_upcase_table(sbi); free_bh: - brelse(bh); + brelse(sbi->pbr_bh); return ret; } @@ -531,17 +538,18 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) if (opts->discard) { struct request_queue *q = bdev_get_queue(sb->s_bdev); - if (!blk_queue_discard(q)) + if (!blk_queue_discard(q)) { exfat_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but the device does not support discard"); - opts->discard = 0; + opts->discard = 0; + } } sb->s_flags |= SB_NODIRATIME; sb->s_magic = EXFAT_SUPER_MAGIC; sb->s_op = &exfat_sops; - sb->s_time_gran = 1; + sb->s_time_gran = 10 * NSEC_PER_MSEC; sb->s_time_min = EXFAT_MIN_TIMESTAMP_SECS; sb->s_time_max = EXFAT_MAX_TIMESTAMP_SECS; @@ -605,6 +613,7 @@ put_inode: free_table: exfat_free_upcase_table(sbi); exfat_free_bitmap(sbi); + brelse(sbi->pbr_bh); check_nls_io: unload_nls(sbi->nls_io); @@ -717,6 +726,7 @@ static void __exit exit_exfat_fs(void) module_init(init_exfat_fs); module_exit(exit_exfat_fs); +MODULE_ALIAS_FS("exfat"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("exFAT filesystem support"); MODULE_AUTHOR("Samsung Electronics Co., Ltd."); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 91eb4381cae5..3147bb0cf82a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -722,7 +722,7 @@ enum { 
#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF /* Max logical block we can support */ -#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFF +#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFE /* * Structure of an inode on the disk @@ -1357,11 +1357,9 @@ struct ext4_super_block { */ #define EXT4_MF_MNTDIR_SAMPLED 0x0001 #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ -#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004 #ifdef CONFIG_FS_ENCRYPTION -#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \ - EXT4_MF_TEST_DUMMY_ENCRYPTION)) +#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_ctx.ctx != NULL) #else #define DUMMY_ENCRYPTION_ENABLED(sbi) (0) #endif @@ -1551,6 +1549,9 @@ struct ext4_sb_info { struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; + /* Encryption context for '-o test_dummy_encryption' */ + struct fscrypt_dummy_context s_dummy_enc_ctx; + /* * Barrier between writepages ops and changing any inode's JOURNAL_DATA * or EXTENTS flag. diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f2b577b315a0..2b4b94542e34 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4832,6 +4832,28 @@ static const struct iomap_ops ext4_iomap_xattr_ops = { .iomap_begin = ext4_iomap_xattr_begin, }; +static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) +{ + u64 maxbytes; + + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + maxbytes = inode->i_sb->s_maxbytes; + else + maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; + + if (*len == 0) + return -EINVAL; + if (start > maxbytes) + return -EFBIG; + + /* + * Shrink request scope to what the fs can actually handle. + */ + if (*len > maxbytes || (maxbytes - *len) < start) + *len = maxbytes - start; + return 0; +} + static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, bool from_es_cache) { @@ -4852,6 +4874,15 @@ static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (fiemap_check_flags(fieinfo, ext4_fiemap_flags)) return -EBADR; + /* + * For bitmap files the maximum size limit could be smaller than + * s_maxbytes, so check len here manually instead of just relying on the + * generic check. + */ + error = ext4_fiemap_check_ranges(inode, start, &len); + if (error) + return error; + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; error = iomap_fiemap(inode, fieinfo, start, len, diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bfc1281fc4cb..0746532ba463 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -733,29 +733,6 @@ static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa) fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid); } -/* copied from fs/ioctl.c */ -static int fiemap_check_ranges(struct super_block *sb, - u64 start, u64 len, u64 *new_len) -{ - u64 maxbytes = (u64) sb->s_maxbytes; - - *new_len = len; - - if (len == 0) - return -EINVAL; - - if (start > maxbytes) - return -EFBIG; - - /* - * Shrink request scope to what the fs can actually handle. - */ - if (len > maxbytes || (maxbytes - len) < start) - *new_len = maxbytes - start; - - return 0; -} - /* So that the fiemap access checks can't overflow on 32 bit machines. 
*/ #define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) @@ -765,8 +742,6 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) struct fiemap __user *ufiemap = (struct fiemap __user *) arg; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = file_inode(filp); - struct super_block *sb = inode->i_sb; - u64 len; int error; if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) @@ -775,11 +750,6 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) return -EINVAL; - error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, - &len); - if (error) - return error; - fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; @@ -792,7 +762,8 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) filemap_write_and_wait(inode->i_mapping); - error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, len); + error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, + fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bf5fcb477f66..4a3d21972011 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1106,6 +1106,7 @@ static void ext4_put_super(struct super_block *sb) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); fs_put_dax(sbi->s_daxdev); + fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx); #ifdef CONFIG_UNICODE utf8_unload(sbi->s_encoding); #endif @@ -1389,9 +1390,10 @@ retry: return res; } -static bool ext4_dummy_context(struct inode *inode) +static const union fscrypt_context * +ext4_get_dummy_context(struct super_block *sb) { - return DUMMY_ENCRYPTION_ENABLED(EXT4_SB(inode->i_sb)); + return EXT4_SB(sb)->s_dummy_enc_ctx.ctx; } static bool ext4_has_stable_inodes(struct super_block *sb) @@ -1410,7 +1412,7 @@ static const struct fscrypt_operations ext4_cryptops = { .key_prefix = "ext4:", .get_context = ext4_get_context, .set_context = ext4_set_context, - .dummy_context = ext4_dummy_context, + .get_dummy_context = ext4_get_dummy_context, .empty_dir = ext4_empty_dir, .max_namelen = EXT4_NAME_LEN, .has_stable_inodes = ext4_has_stable_inodes, @@ -1605,6 +1607,7 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, + {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, {Opt_nombcache, "nombcache"}, {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ @@ -1816,7 +1819,7 @@ static const struct mount_opts { {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, {Opt_max_dir_size_kb, 0, MOPT_GTE0}, - {Opt_test_dummy_encryption, 0, MOPT_GTE0}, + {Opt_test_dummy_encryption, 0, MOPT_STRING}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_err, 0, 0} }; @@ -1851,6 +1854,48 @@ static int ext4_sb_read_encoding(const struct ext4_super_block *es, } #endif +static int ext4_set_test_dummy_encryption(struct super_block *sb, + const char *opt, + const substring_t *arg, + bool is_remount) +{ +#ifdef CONFIG_FS_ENCRYPTION + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; + + /* + * This mount option is just for testing, and it's not worthwhile to + * implement the 
extra complexity (e.g. RCU protection) that would be + * needed to allow it to be set or changed during remount. We do allow + * it to be specified during remount, but only if there is no change. + */ + if (is_remount && !sbi->s_dummy_enc_ctx.ctx) { + ext4_msg(sb, KERN_WARNING, + "Can't set test_dummy_encryption on remount"); + return -1; + } + err = fscrypt_set_test_dummy_encryption(sb, arg, &sbi->s_dummy_enc_ctx); + if (err) { + if (err == -EEXIST) + ext4_msg(sb, KERN_WARNING, + "Can't change test_dummy_encryption on remount"); + else if (err == -EINVAL) + ext4_msg(sb, KERN_WARNING, + "Value of option \"%s\" is unrecognized", opt); + else + ext4_msg(sb, KERN_WARNING, + "Error processing option \"%s\" [%d]", + opt, err); + return -1; + } + ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled"); +#else + ext4_msg(sb, KERN_WARNING, + "Test dummy encryption mount option ignored"); +#endif + return 1; +} + static int handle_mount_opt(struct super_block *sb, char *opt, int token, substring_t *args, unsigned long *journal_devnum, unsigned int *journal_ioprio, int is_remount) @@ -2047,14 +2092,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); } else if (token == Opt_test_dummy_encryption) { -#ifdef CONFIG_FS_ENCRYPTION - sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION; - ext4_msg(sb, KERN_WARNING, - "Test dummy encryption mode enabled"); -#else - ext4_msg(sb, KERN_WARNING, - "Test dummy encryption mount option ignored"); -#endif + return ext4_set_test_dummy_encryption(sb, opt, &args[0], + is_remount); } else if (m->flags & MOPT_DATAJ) { if (is_remount) { if (!sbi->s_journal) @@ -2311,8 +2350,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb); if (test_opt(sb, DATA_ERR_ABORT)) SEQ_OPTS_PUTS("data_err=abort"); - if (DUMMY_ENCRYPTION_ENABLED(sbi)) - SEQ_OPTS_PUTS("test_dummy_encryption"); + + fscrypt_show_test_dummy_encryption(seq, sep, sb); ext4_show_quota_options(seq, sb); return 0; @@ -4780,6 +4819,7 @@ failed_mount: for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); #endif + fscrypt_free_dummy_context(&sbi->s_dummy_enc_ctx); ext4_blkdev_remove(sbi); brelse(bh); out_fail: diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 04bfaf63752c..6c9fc9e21c13 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -293,6 +293,7 @@ EXT4_ATTR_FEATURE(batched_discard); EXT4_ATTR_FEATURE(meta_bg_resize); #ifdef CONFIG_FS_ENCRYPTION EXT4_ATTR_FEATURE(encryption); +EXT4_ATTR_FEATURE(test_dummy_encryption_v2); #endif #ifdef CONFIG_UNICODE EXT4_ATTR_FEATURE(casefold); @@ -308,6 +309,7 @@ static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(meta_bg_resize), #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), + ATTR_LIST(test_dummy_encryption_v2), #endif #ifdef CONFIG_UNICODE ATTR_LIST(casefold), diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ba470d5687fe..157eec348970 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -138,7 +138,7 @@ struct f2fs_mount_info { int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ int bggc_mode; /* bggc mode: off, on or sync */ - bool test_dummy_encryption; /* test dummy encryption */ + struct fscrypt_dummy_context dummy_enc_ctx; /* test dummy encryption */ block_t unusable_cap; /* Amount of space allowed to be * unusable when disabling checkpoint */ @@ -1259,7 +1259,7 @@ enum fsync_mode { #ifdef CONFIG_FS_ENCRYPTION #define 
DUMMY_ENCRYPTION_ENABLED(sbi) \ - (unlikely(F2FS_OPTION(sbi).test_dummy_encryption)) + (unlikely(F2FS_OPTION(sbi).dummy_enc_ctx.ctx != NULL)) #else #define DUMMY_ENCRYPTION_ENABLED(sbi) (0) #endif diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 5bc4dcd8fc03..8c4ea5003ef8 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -12,7 +12,6 @@ #include <linux/types.h> #include <linux/fs.h> #include <linux/f2fs_fs.h> -#include <linux/cryptohash.h> #include <linux/pagemap.h> #include <linux/unicode.h> diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f2dfc21c6abb..8a9955902d84 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -202,6 +202,7 @@ static match_table_t f2fs_tokens = { {Opt_whint, "whint_mode=%s"}, {Opt_alloc, "alloc_mode=%s"}, {Opt_fsync, "fsync_mode=%s"}, + {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, {Opt_checkpoint_disable, "checkpoint=disable"}, {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"}, @@ -394,7 +395,52 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) } #endif -static int parse_options(struct super_block *sb, char *options) +static int f2fs_set_test_dummy_encryption(struct super_block *sb, + const char *opt, + const substring_t *arg, + bool is_remount) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); +#ifdef CONFIG_FS_ENCRYPTION + int err; + + if (!f2fs_sb_has_encrypt(sbi)) { + f2fs_err(sbi, "Encrypt feature is off"); + return -EINVAL; + } + + /* + * This mount option is just for testing, and it's not worthwhile to + * implement the extra complexity (e.g. RCU protection) that would be + * needed to allow it to be set or changed during remount. We do allow + * it to be specified during remount, but only if there is no change. + */ + if (is_remount && !F2FS_OPTION(sbi).dummy_enc_ctx.ctx) { + f2fs_warn(sbi, "Can't set test_dummy_encryption on remount"); + return -EINVAL; + } + err = fscrypt_set_test_dummy_encryption( + sb, arg, &F2FS_OPTION(sbi).dummy_enc_ctx); + if (err) { + if (err == -EEXIST) + f2fs_warn(sbi, + "Can't change test_dummy_encryption on remount"); + else if (err == -EINVAL) + f2fs_warn(sbi, "Value of option \"%s\" is unrecognized", + opt); + else + f2fs_warn(sbi, "Error processing option \"%s\" [%d]", + opt, err); + return -EINVAL; + } + f2fs_warn(sbi, "Test dummy encryption mode enabled"); +#else + f2fs_warn(sbi, "Test dummy encryption mount option ignored"); +#endif + return 0; +} + +static int parse_options(struct super_block *sb, char *options, bool is_remount) { struct f2fs_sb_info *sbi = F2FS_SB(sb); substring_t args[MAX_OPT_ARGS]; @@ -403,9 +449,7 @@ static int parse_options(struct super_block *sb, char *options) int arg = 0, ext_cnt; kuid_t uid; kgid_t gid; -#ifdef CONFIG_QUOTA int ret; -#endif if (!options) return 0; @@ -778,17 +822,10 @@ static int parse_options(struct super_block *sb, char *options) kvfree(name); break; case Opt_test_dummy_encryption: -#ifdef CONFIG_FS_ENCRYPTION - if (!f2fs_sb_has_encrypt(sbi)) { - f2fs_err(sbi, "Encrypt feature is off"); - return -EINVAL; - } - - F2FS_OPTION(sbi).test_dummy_encryption = true; - f2fs_info(sbi, "Test dummy encryption mode enabled"); -#else - f2fs_info(sbi, "Test dummy encryption mount option ignored"); -#endif + ret = f2fs_set_test_dummy_encryption(sb, p, &args[0], + is_remount); + if (ret) + return ret; break; case Opt_checkpoint_disable_cap_perc: if (args->from && match_int(args, &arg)) @@ -1213,6 +1250,7 @@ static void f2fs_put_super(struct super_block *sb) for (i = 0; i < MAXQUOTAS; i++) 
kvfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif + fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); @@ -1543,10 +1581,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",whint_mode=%s", "user-based"); else if (F2FS_OPTION(sbi).whint_mode == WHINT_MODE_FS) seq_printf(seq, ",whint_mode=%s", "fs-based"); -#ifdef CONFIG_FS_ENCRYPTION - if (F2FS_OPTION(sbi).test_dummy_encryption) - seq_puts(seq, ",test_dummy_encryption"); -#endif + + fscrypt_show_test_dummy_encryption(seq, ',', sbi->sb); if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_DEFAULT) seq_printf(seq, ",alloc_mode=%s", "default"); @@ -1575,7 +1611,6 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT; F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX; - F2FS_OPTION(sbi).test_dummy_encryption = false; F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID); F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID); F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4; @@ -1734,7 +1769,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) default_options(sbi); /* parse mount options */ - err = parse_options(sb, data); + err = parse_options(sb, data, true); if (err) goto restore_opts; checkpoint_changed = @@ -2410,9 +2445,10 @@ static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, ctx, len, fs_data, XATTR_CREATE); } -static bool f2fs_dummy_context(struct inode *inode) +static const union fscrypt_context * +f2fs_get_dummy_context(struct super_block *sb) { - return DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(inode)); + return F2FS_OPTION(F2FS_SB(sb)).dummy_enc_ctx.ctx; } static bool f2fs_has_stable_inodes(struct super_block *sb) @@ -2431,7 +2467,7 @@ static const struct fscrypt_operations f2fs_cryptops = { .key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, - .dummy_context = f2fs_dummy_context, + .get_dummy_context = f2fs_get_dummy_context, .empty_dir = f2fs_empty_dir, .max_namelen = F2FS_NAME_LEN, .has_stable_inodes = f2fs_has_stable_inodes, @@ -3366,7 +3402,7 @@ try_onemore: goto free_sb_buf; } - err = parse_options(sb, options); + err = parse_options(sb, options, false); if (err) goto free_options; @@ -3769,6 +3805,7 @@ free_options: for (i = 0; i < MAXQUOTAS; i++) kvfree(F2FS_OPTION(sbi).s_qf_names[i]); #endif + fscrypt_free_dummy_context(&F2FS_OPTION(sbi).dummy_enc_ctx); kvfree(options); free_sb_buf: kvfree(raw_super); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e3bbbef9b4f0..3162f46b3c9b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -446,6 +446,7 @@ enum feat_id { FEAT_SB_CHECKSUM, FEAT_CASEFOLD, FEAT_COMPRESSION, + FEAT_TEST_DUMMY_ENCRYPTION_V2, }; static ssize_t f2fs_feature_show(struct f2fs_attr *a, @@ -466,6 +467,7 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a, case FEAT_SB_CHECKSUM: case FEAT_CASEFOLD: case FEAT_COMPRESSION: + case FEAT_TEST_DUMMY_ENCRYPTION_V2: return sprintf(buf, "supported\n"); } return 0; @@ -563,6 +565,7 @@ F2FS_GENERAL_RO_ATTR(avg_vblocks); #ifdef CONFIG_FS_ENCRYPTION F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); +F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2, FEAT_TEST_DUMMY_ENCRYPTION_V2); #endif #ifdef CONFIG_BLK_DEV_ZONED F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED); @@ -647,6 +650,7 @@ ATTRIBUTE_GROUPS(f2fs); static struct attribute 
*f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION ATTR_LIST(encryption), + ATTR_LIST(test_dummy_encryption_v2), #endif #ifdef CONFIG_BLK_DEV_ZONED ATTR_LIST(block_zoned), diff --git a/fs/file.c b/fs/file.c index c8a4e4c86e55..abb8b7081d7a 100644 --- a/fs/file.c +++ b/fs/file.c @@ -70,7 +70,7 @@ static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, */ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) { - unsigned int cpy, set; + size_t cpy, set; BUG_ON(nfdt->max_fds < ofdt->max_fds); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 936a8ec6b48e..6306eaae378b 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -528,10 +528,12 @@ lower_metapath: /* Advance in metadata tree. */ (mp->mp_list[hgt])++; - if (mp->mp_list[hgt] >= sdp->sd_inptrs) { - if (!hgt) + if (hgt) { + if (mp->mp_list[hgt] >= sdp->sd_inptrs) + goto lower_metapath; + } else { + if (mp->mp_list[hgt] >= sdp->sd_diptrs) break; - goto lower_metapath; } fill_up_metapath: @@ -876,10 +878,9 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, ret = -ENOENT; goto unlock; } else { - /* report a hole */ iomap->offset = pos; iomap->length = length; - goto do_alloc; + goto hole_found; } } iomap->length = size; @@ -933,8 +934,6 @@ unlock: return ret; do_alloc: - iomap->addr = IOMAP_NULL_ADDR; - iomap->type = IOMAP_HOLE; if (flags & IOMAP_REPORT) { if (pos >= size) ret = -ENOENT; @@ -956,6 +955,9 @@ do_alloc: if (pos < size && height == ip->i_height) ret = gfs2_hole_size(inode, lblock, len, mp, iomap); } +hole_found: + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; goto out; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 29f9b6684b74..bf70e3b14938 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -613,7 +613,7 @@ __acquires(&gl->gl_lockref.lock) fs_err(sdp, "Error %d syncing glock \n", ret); gfs2_dump_glock(NULL, gl, true); } - return; + goto skip_inval; } } if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) { @@ -633,6 +633,7 @@ __acquires(&gl->gl_lockref.lock) clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); } +skip_inval: gfs2_glock_hold(gl); /* * Check for an error encountered since we called go_sync and go_inval. 
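
Returning to the fs/file.c copy_fdtable() hunk above: widening cpy and set to size_t matters because the byte counts are derived from max_fds, and with 8-byte pointers a table of 2^29 slots already needs 4 GiB, a value an unsigned int variable silently truncates to zero. A quick userspace illustration of the arithmetic (mine, not the patch's):

#include <stdio.h>

int main(void)
{
	unsigned int max_fds = 1U << 29;	/* 2^29 open-file slots */

	/* Old code: the size_t product is truncated by the u32 variable. */
	unsigned int bad = max_fds * sizeof(void *);	/* 2^32 wraps to 0 */

	/* New code: keeping the product in a size_t preserves all 33 bits. */
	size_t good = (size_t)max_fds * sizeof(void *);

	printf("bad=%u good=%zu\n", bad, good);
	return 0;
}

A zero-length copy followed by a zero-length memset would leave the new fdtable uninitialized, which is the corruption the one-line type change prevents.
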
@@ -722,9 +723,6 @@ __acquires(&gl->gl_lockref.lock) goto out_unlock; if (nonblock) goto out_sched; - smp_mb(); - if (atomic_read(&gl->gl_revokes) != 0) - goto out_sched; set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); gl->gl_target = gl->gl_demote_state; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 70b2d3a1e866..5acd3ce30759 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -622,7 +622,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = finish_no_open(file, NULL); } gfs2_glock_dq_uninit(ghs); - return error; + goto fail; } else if (error != -ENOENT) { goto fail_gunlock; } @@ -764,9 +764,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = finish_open(file, dentry, gfs2_open_common); } gfs2_glock_dq_uninit(ghs); + gfs2_qa_put(ip); gfs2_glock_dq_uninit(ghs + 1); clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); gfs2_glock_put(io_gl); + gfs2_qa_put(dip); return error; fail_gunlock3: @@ -776,7 +778,6 @@ fail_gunlock2: clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); gfs2_glock_put(io_gl); fail_free_inode: - gfs2_qa_put(ip); if (ip->i_gl) { glock_clear_object(ip->i_gl, ip); gfs2_glock_put(ip->i_gl); @@ -1005,7 +1006,7 @@ out_gunlock: out_child: gfs2_glock_dq(ghs); out_parent: - gfs2_qa_put(ip); + gfs2_qa_put(dip); gfs2_holder_uninit(ghs); gfs2_holder_uninit(ghs + 1); return error; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 3a75843ae580..0644e58c6191 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -669,13 +669,13 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) struct buffer_head *bh = bd->bd_bh; struct gfs2_glock *gl = bd->bd_gl; + sdp->sd_log_num_revoke++; + if (atomic_inc_return(&gl->gl_revokes) == 1) + gfs2_glock_hold(gl); bh->b_private = NULL; bd->bd_blkno = bh->b_blocknr; gfs2_remove_from_ail(bd); /* drops ref on bh */ bd->bd_bh = NULL; - sdp->sd_log_num_revoke++; - if (atomic_inc_return(&gl->gl_revokes) == 1) - gfs2_glock_hold(gl); set_bit(GLF_LFLUSH, &gl->gl_flags); list_add(&bd->bd_list, &sdp->sd_log_revokes); } @@ -1131,6 +1131,10 @@ int gfs2_logd(void *data) while (!kthread_should_stop()) { + if (gfs2_withdrawn(sdp)) { + msleep_interruptible(HZ); + continue; + } /* Check for errors writing to the journal */ if (sdp->sd_log_error) { gfs2_lm(sdp, @@ -1139,6 +1143,7 @@ int gfs2_logd(void *data) "prevent further damage.\n", sdp->sd_fsname, sdp->sd_log_error); gfs2_withdraw(sdp); + continue; } did_flush = false; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 5ea96757afc4..cb2a11b458c6 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -263,7 +263,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno, struct super_block *sb = sdp->sd_vfs; struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - bio->bi_iter.bi_sector = blkno << (sb->s_blocksize_bits - 9); + bio->bi_iter.bi_sector = blkno << sdp->sd_fsb2bb_shift; bio_set_dev(bio, sb->s_bdev); bio->bi_end_io = end_io; bio->bi_private = sdp; @@ -509,12 +509,12 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, unsigned int bsize = sdp->sd_sb.sb_bsize, off; unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift; unsigned int shift = PAGE_SHIFT - bsize_shift; - unsigned int readahead_blocks = BIO_MAX_PAGES << shift; + unsigned int max_blocks = 2 * 1024 * 1024 >> bsize_shift; struct gfs2_journal_extent *je; int sz, ret = 0; struct bio *bio = NULL; struct page *page = NULL; - bool bio_chained = false, done = false; + bool done = 
false; errseq_t since; memset(head, 0, sizeof(*head)); @@ -537,30 +537,30 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, off = 0; } - if (!bio || (bio_chained && !off)) { - /* start new bio */ - } else { - sz = bio_add_page(bio, page, bsize, off); - if (sz == bsize) - goto block_added; + if (bio && (off || block < blocks_submitted + max_blocks)) { + sector_t sector = dblock << sdp->sd_fsb2bb_shift; + + if (bio_end_sector(bio) == sector) { + sz = bio_add_page(bio, page, bsize, off); + if (sz == bsize) + goto block_added; + } if (off) { unsigned int blocks = (PAGE_SIZE - off) >> bsize_shift; bio = gfs2_chain_bio(bio, blocks); - bio_chained = true; goto add_block_to_new_bio; } } if (bio) { - blocks_submitted = block + 1; + blocks_submitted = block; submit_bio(bio); } bio = gfs2_log_alloc_bio(sdp, dblock, gfs2_end_log_read); bio->bi_opf = REQ_OP_READ; - bio_chained = false; add_block_to_new_bio: sz = bio_add_page(bio, page, bsize, off); BUG_ON(sz != bsize); @@ -568,7 +568,7 @@ block_added: off += bsize; if (off == PAGE_SIZE) page = NULL; - if (blocks_submitted < blocks_read + readahead_blocks) { + if (blocks_submitted <= blocks_read + max_blocks) { /* Keep at least one bio in flight */ continue; } diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 4b72abcf83b2..9856cc2e0795 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -252,7 +252,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, int num = 0; if (unlikely(gfs2_withdrawn(sdp)) && - (!sdp->sd_jdesc || (blkno != sdp->sd_jdesc->jd_no_addr))) { + (!sdp->sd_jdesc || gl != sdp->sd_jinode_gl)) { *bhp = NULL; return -EIO; } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index cc0c4b5800be..8259fef3f986 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1051,8 +1051,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) u32 x; int error = 0; - if (capable(CAP_SYS_RESOURCE) || - sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) return 0; error = gfs2_quota_hold(ip, uid, gid); @@ -1125,7 +1124,7 @@ void gfs2_quota_unlock(struct gfs2_inode *ip) int found; if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags)) - goto out; + return; for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { struct gfs2_quota_data *qd; @@ -1162,7 +1161,6 @@ void gfs2_quota_unlock(struct gfs2_inode *ip) qd_unlock(qda[x]); } -out: gfs2_quota_unhold(ip); } @@ -1210,9 +1208,6 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, if (!test_bit(GIF_QD_LOCKED, &ip->i_flags)) return 0; - if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) - return 0; - for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { qd = ip->i_qadata->qa_qd[x]; @@ -1270,7 +1265,9 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, if (ip->i_diskflags & GFS2_DIF_SYSTEM) return; - BUG_ON(ip->i_qadata->qa_ref <= 0); + if (gfs2_assert_withdraw(sdp, ip->i_qadata && + ip->i_qadata->qa_ref > 0)) + return; for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { qd = ip->i_qadata->qa_qd[x]; diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 7f9ca8ef40fc..21ada332d555 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -44,7 +44,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, int ret; ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */ - if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + if (capable(CAP_SYS_RESOURCE) || + sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (ret) diff --git 
a/fs/gfs2/super.c b/fs/gfs2/super.c index 37fc41632aa2..956fced0a8ec 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1404,7 +1404,6 @@ out: if (ip->i_qadata) gfs2_assert_warn(sdp, ip->i_qadata->qa_ref == 0); gfs2_rs_delete(ip, NULL); - gfs2_qa_put(ip); gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 9b64d40ab379..aa087a5675af 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -119,6 +119,12 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) if (!sb_rdonly(sdp->sd_vfs)) ret = gfs2_make_fs_ro(sdp); + if (sdp->sd_lockstruct.ls_ops->lm_lock == NULL) { /* lock_nolock */ + if (!ret) + ret = -EIO; + clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags); + goto skip_recovery; + } /* * Drop the glock for our journal so another node can recover it. */ @@ -159,10 +165,6 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) wait_on_bit(&gl->gl_flags, GLF_FREEING, TASK_UNINTERRUPTIBLE); } - if (sdp->sd_lockstruct.ls_ops->lm_lock == NULL) { /* lock_nolock */ - clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags); - goto skip_recovery; - } /* * Dequeue the "live" glock, but keep a reference so it's never freed. */ diff --git a/fs/io_uring.c b/fs/io_uring.c index 381d50becd04..bb25e3997d41 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -524,6 +524,7 @@ enum { REQ_F_OVERFLOW_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, + REQ_F_NO_FILE_TABLE_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -577,6 +578,8 @@ enum { REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), /* buffer already selected */ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), + /* doesn't need file table for this request */ + REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), }; struct async_poll { @@ -616,6 +619,8 @@ struct io_kiocb { bool needs_fixed_file; u8 opcode; + u16 buf_index; + struct io_ring_ctx *ctx; struct list_head list; unsigned int flags; @@ -677,8 +682,6 @@ struct io_op_def { unsigned needs_mm : 1; /* needs req->file assigned */ unsigned needs_file : 1; - /* needs req->file assigned IFF fd is >= 0 */ - unsigned fd_non_neg : 1; /* hash wq insertion if file is a regular file */ unsigned hash_reg_file : 1; /* unbound wq insertion if file is a non-regular file */ @@ -781,8 +784,6 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, }, [IORING_OP_OPENAT] = { - .needs_file = 1, - .fd_non_neg = 1, .file_table = 1, .needs_fs = 1, }, @@ -796,9 +797,8 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_STATX] = { .needs_mm = 1, - .needs_file = 1, - .fd_non_neg = 1, .needs_fs = 1, + .file_table = 1, }, [IORING_OP_READ] = { .needs_mm = 1, @@ -833,8 +833,6 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, }, [IORING_OP_OPENAT2] = { - .needs_file = 1, - .fd_non_neg = 1, .file_table = 1, .needs_fs = 1, }, @@ -928,6 +926,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) goto err; ctx->flags = p->flags; + init_waitqueue_head(&ctx->sqo_wait); init_waitqueue_head(&ctx->cq_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->completions[0]); @@ -1291,7 +1290,7 @@ static struct io_kiocb *io_get_fallback_req(struct io_ring_ctx *ctx) struct io_kiocb *req; req = ctx->fallback_req; - if (!test_and_set_bit_lock(0, (unsigned long *) ctx->fallback_req)) + if (!test_and_set_bit_lock(0, (unsigned long *) &ctx->fallback_req)) return req; return NULL; @@ -1378,7 +1377,7 @@ static void __io_free_req(struct io_kiocb *req) if 
(likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); else - clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req); + clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req); } struct req_batch { @@ -1398,10 +1397,6 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb) for (i = 0; i < rb->to_free; i++) { struct io_kiocb *req = rb->reqs[i]; - if (req->flags & REQ_F_FIXED_FILE) { - req->file = NULL; - percpu_ref_put(req->fixed_file_refs); - } if (req->flags & REQ_F_INFLIGHT) inflight++; __io_req_aux_free(req); @@ -1674,7 +1669,7 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req) if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req)) return false; - if (!(req->flags & REQ_F_FIXED_FILE) || req->io) + if (req->file || req->io) rb->need_iter++; rb->reqs[rb->to_free++] = req; @@ -2034,7 +2029,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) * any file. For now, just ensure that anything potentially problematic is done * inline. */ -static bool io_file_supports_async(struct file *file) +static bool io_file_supports_async(struct file *file, int rw) { umode_t mode = file_inode(file)->i_mode; @@ -2043,7 +2038,13 @@ static bool io_file_supports_async(struct file *file) if (S_ISREG(mode) && file->f_op != &io_uring_fops) return true; - return false; + if (!(file->f_mode & FMODE_NOWAIT)) + return false; + + if (rw == READ) + return file->f_op->read_iter != NULL; + + return file->f_op->write_iter != NULL; } static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, @@ -2102,9 +2103,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); - /* we own ->private, reuse it for the buffer index / buffer ID */ - req->rw.kiocb.private = (void *) (unsigned long) - READ_ONCE(sqe->buf_index); + req->buf_index = READ_ONCE(sqe->buf_index); return 0; } @@ -2147,7 +2146,7 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw, struct io_ring_ctx *ctx = req->ctx; size_t len = req->rw.len; struct io_mapped_ubuf *imu; - unsigned index, buf_index; + u16 index, buf_index; size_t offset; u64 buf_addr; @@ -2155,7 +2154,7 @@ static ssize_t io_import_fixed(struct io_kiocb *req, int rw, if (unlikely(!ctx->user_bufs)) return -EFAULT; - buf_index = (unsigned long) req->rw.kiocb.private; + buf_index = req->buf_index; if (unlikely(buf_index >= ctx->nr_user_bufs)) return -EFAULT; @@ -2271,10 +2270,10 @@ static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, bool needs_lock) { struct io_buffer *kbuf; - int bgid; + u16 bgid; kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; - bgid = (int) (unsigned long) req->rw.kiocb.private; + bgid = req->buf_index; kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock); if (IS_ERR(kbuf)) return kbuf; @@ -2365,7 +2364,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, } /* buffer index only valid with fixed read/write, or buffer select */ - if (req->rw.kiocb.private && !(req->flags & REQ_F_BUFFER_SELECT)) + if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { @@ -2571,7 +2570,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so * we know to async punt it even if it was opened O_NONBLOCK */ - if (force_nonblock && !io_file_supports_async(req->file)) + if 
(force_nonblock && !io_file_supports_async(req->file, READ)) goto copy_iov; iov_count = iov_iter_count(&iter); @@ -2594,7 +2593,8 @@ copy_iov: if (ret) goto out_free; /* any defer here is final, must blocking retry */ - if (!(req->flags & REQ_F_NOWAIT)) + if (!(req->flags & REQ_F_NOWAIT) && + !file_can_poll(req->file)) req->flags |= REQ_F_MUST_PUNT; return -EAGAIN; } @@ -2662,7 +2662,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock) * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so * we know to async punt it even if it was opened O_NONBLOCK */ - if (force_nonblock && !io_file_supports_async(req->file)) + if (force_nonblock && !io_file_supports_async(req->file, WRITE)) goto copy_iov; /* file path doesn't support NOWAIT for non-direct_IO */ @@ -2716,7 +2716,8 @@ copy_iov: if (ret) goto out_free; /* any defer here is final, must blocking retry */ - req->flags |= REQ_F_MUST_PUNT; + if (!file_can_poll(req->file)) + req->flags |= REQ_F_MUST_PUNT; return -EAGAIN; } } @@ -2756,15 +2757,6 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static bool io_splice_punt(struct file *file) -{ - if (get_pipe_info(file)) - return false; - if (!io_file_supports_async(file)) - return true; - return !(file->f_flags & O_NONBLOCK); -} - static int io_splice(struct io_kiocb *req, bool force_nonblock) { struct io_splice *sp = &req->splice; @@ -2772,19 +2764,16 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock) struct file *out = sp->file_out; unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; loff_t *poff_in, *poff_out; - long ret; + long ret = 0; - if (force_nonblock) { - if (io_splice_punt(in) || io_splice_punt(out)) - return -EAGAIN; - flags |= SPLICE_F_NONBLOCK; - } + if (force_nonblock) + return -EAGAIN; poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; - ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; + + if (sp->len) + ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED)); req->flags &= ~REQ_F_NEED_CLEANUP; @@ -3355,8 +3344,12 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock) struct kstat stat; int ret; - if (force_nonblock) + if (force_nonblock) { + /* only need file table for an actual valid fd */ + if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD) + req->flags |= REQ_F_NO_FILE_TABLE; return -EAGAIN; + } if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags)) return -EINVAL; @@ -3502,7 +3495,7 @@ static void io_sync_file_range_finish(struct io_wq_work **workptr) if (io_req_cancelled(req)) return; __io_sync_file_range(req); - io_put_req(req); /* put submission ref */ + io_steal_work(req, workptr); } static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) @@ -4142,12 +4135,14 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, req->result = mask; init_task_work(&req->task_work, func); /* - * If this fails, then the task is exiting. Punt to one of the io-wq - * threads to ensure the work gets run, we can't always rely on exit - * cancelation taking care of this. + * If this fails, then the task is exiting. When a task exits, the + * work gets canceled, so just cancel this request as well instead + * of executing it. We can't safely execute it anyway, as we may not + * have the state needed for it.
*/ ret = task_work_add(tsk, &req->task_work, true); if (unlikely(ret)) { + WRITE_ONCE(poll->canceled, true); tsk = io_wq_get_task(req->ctx->io_wq); task_work_add(tsk, &req->task_work, true); } @@ -4200,17 +4195,17 @@ static void io_async_task_func(struct callback_head *cb) spin_unlock_irq(&ctx->completion_lock); + /* restore ->work in case we need to retry again */ + memcpy(&req->work, &apoll->work, sizeof(req->work)); + if (canceled) { kfree(apoll); io_cqring_ev_posted(ctx); req_set_fail_links(req); - io_put_req(req); + io_double_put_req(req); return; } - /* restore ->work in case we need to retry again */ - memcpy(&req->work, &apoll->work, sizeof(req->work)); - __set_current_state(TASK_RUNNING); mutex_lock(&ctx->uring_lock); __io_queue_sqe(req, NULL); @@ -4369,7 +4364,7 @@ static bool io_poll_remove_one(struct io_kiocb *req) hash_del(&req->hash_node); - if (apoll) { + if (do_complete && apoll) { /* * restore ->work because we need to call io_req_work_drop_env. */ @@ -5015,15 +5010,16 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) int ret; /* Still need defer if there is pending req in defer list. */ - if (!req_need_defer(req) && list_empty(&ctx->defer_list)) + if (!req_need_defer(req) && list_empty_careful(&ctx->defer_list)) return 0; - if (!req->io && io_alloc_async_ctx(req)) - return -EAGAIN; - - ret = io_req_defer_prep(req, sqe); - if (ret < 0) - return ret; + if (!req->io) { + if (io_alloc_async_ctx(req)) + return -EAGAIN; + ret = io_req_defer_prep(req, sqe); + if (ret < 0) + return ret; + } spin_lock_irq(&ctx->completion_lock); if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { @@ -5310,7 +5306,8 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (ret) return ret; - if (ctx->flags & IORING_SETUP_IOPOLL) { + /* If the op doesn't have a file, we're not polling for it */ + if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) { const bool in_async = io_wq_current_is_worker(); if (req->result == -EAGAIN) @@ -5364,15 +5361,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr) io_steal_work(req, workptr); } -static int io_req_needs_file(struct io_kiocb *req, int fd) -{ - if (!io_op_defs[req->opcode].needs_file) - return 0; - if ((fd == -1 || fd == AT_FDCWD) && io_op_defs[req->opcode].fd_non_neg) - return 0; - return 1; -} - static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, int index) { @@ -5410,14 +5398,11 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req, } static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, - int fd, unsigned int flags) + int fd) { bool fixed; - if (!io_req_needs_file(req, fd)) - return 0; - - fixed = (flags & IOSQE_FIXED_FILE); + fixed = (req->flags & REQ_F_FIXED_FILE) != 0; if (unlikely(!fixed && req->needs_fixed_file)) return -EBADF; @@ -5429,7 +5414,7 @@ static int io_grab_files(struct io_kiocb *req) int ret = -EBADF; struct io_ring_ctx *ctx = req->ctx; - if (req->work.files) + if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE)) return 0; if (!ctx->ring_file) return -EBADF; @@ -5623,9 +5608,15 @@ fail_req: io_double_put_req(req); } } else if (req->flags & REQ_F_FORCE_ASYNC) { - ret = io_req_defer_prep(req, sqe); - if (unlikely(ret < 0)) - goto fail_req; + if (!req->io) { + ret = -EAGAIN; + if (io_alloc_async_ctx(req)) + goto fail_req; + ret = io_req_defer_prep(req, sqe); + if (unlikely(ret < 0)) + goto fail_req; + } + /* * Never try inline submit if IOSQE_ASYNC is set, go straight * to async
execution. @@ -5794,7 +5785,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, struct io_submit_state *state, bool async) { unsigned int sqe_flags; - int id, fd; + int id; /* * All io need to record the previous position, if LINK vs DRAIN, @@ -5846,8 +5837,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, IOSQE_ASYNC | IOSQE_FIXED_FILE | IOSQE_BUFFER_SELECT | IOSQE_IO_LINK); - fd = READ_ONCE(sqe->fd); - return io_req_set_file(state, req, fd, sqe_flags); + if (!io_op_defs[req->opcode].needs_file) + return 0; + + return io_req_set_file(state, req, READ_ONCE(sqe->fd)); } static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, @@ -6039,6 +6032,7 @@ static int io_sq_thread(void *data) finish_wait(&ctx->sqo_wait, &wait); ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; + ret = 0; continue; } finish_wait(&ctx->sqo_wait, &wait); @@ -6852,7 +6846,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, { int ret; - init_waitqueue_head(&ctx->sqo_wait); mmgrab(current->mm); ctx->sqo_mm = current->mm; @@ -7327,7 +7320,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) * it could cause shutdown to hang. */ while (ctx->sqo_thread && !wq_has_sleeper(&ctx->sqo_wait)) - cpu_relax(); + cond_resched(); io_kill_timeouts(ctx); io_poll_remove_all(ctx); @@ -7356,11 +7349,9 @@ static int io_uring_release(struct inode *inode, struct file *file) static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { - struct io_kiocb *req; - DEFINE_WAIT(wait); - while (!list_empty_careful(&ctx->inflight_list)) { - struct io_kiocb *cancel_req = NULL; + struct io_kiocb *cancel_req = NULL, *req; + DEFINE_WAIT(wait); spin_lock_irq(&ctx->inflight_lock); list_for_each_entry(req, &ctx->inflight_list, inflight_entry) { @@ -7400,6 +7391,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, */ if (refcount_sub_and_test(2, &cancel_req->refs)) { io_put_req(cancel_req); + finish_wait(&ctx->inflight_wait, &wait); continue; } } @@ -7407,8 +7399,8 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, io_wq_cancel_work(ctx->io_wq, &cancel_req->work); io_put_req(cancel_req); schedule(); + finish_wait(&ctx->inflight_wait, &wait); } - finish_wait(&ctx->inflight_wait, &wait); } static int io_uring_flush(struct file *file, void *data) @@ -7757,7 +7749,8 @@ err: return ret; } -static int io_uring_create(unsigned entries, struct io_uring_params *p) +static int io_uring_create(unsigned entries, struct io_uring_params *p, + struct io_uring_params __user *params) { struct user_struct *user = NULL; struct io_ring_ctx *ctx; @@ -7849,6 +7842,14 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); p->cq_off.cqes = offsetof(struct io_rings, cqes); + p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | + IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | + IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL; + + if (copy_to_user(params, p, sizeof(*p))) { + ret = -EFAULT; + goto err; + } /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup */ @@ -7857,9 +7858,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) if (ret < 0) goto err; - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | - IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | - IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries,
p->flags); return ret; err: @@ -7875,7 +7873,6 @@ err: static long io_uring_setup(u32 entries, struct io_uring_params __user *params) { struct io_uring_params p; - long ret; int i; if (copy_from_user(&p, params, sizeof(p))) @@ -7890,14 +7887,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ)) return -EINVAL; - ret = io_uring_create(entries, &p); - if (ret < 0) - return ret; - - if (copy_to_user(params, &p, sizeof(p))) - return -EFAULT; - - return ret; + return io_uring_create(entries, &p, params); } SYSCALL_DEFINE2(io_uring_setup, u32, entries, diff --git a/fs/ioctl.c b/fs/ioctl.c index 282d45be6f45..5e80b40bc1b5 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -55,6 +55,7 @@ EXPORT_SYMBOL(vfs_ioctl); static int ioctl_fibmap(struct file *filp, int __user *p) { struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; int error, ur_block; sector_t block; @@ -71,6 +72,13 @@ static int ioctl_fibmap(struct file *filp, int __user *p) block = ur_block; error = bmap(inode, &block); + if (block > INT_MAX) { + error = -ERANGE; + pr_warn_ratelimited("[%s/%d] FS: %s File: %pD4 would truncate fibmap result\n", + current->comm, task_pid_nr(current), + sb->s_id, filp); + } + if (error) ur_block = 0; else diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index bccf305ea9ce..d55e8f491a5e 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -117,10 +117,7 @@ iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, if (iomap->type == IOMAP_MAPPED) { addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits; - if (addr > INT_MAX) - WARN(1, "would truncate bmap result\n"); - else - *bno = addr; + *bno = addr; } return 0; } diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 1abf126c2df4..a60df88efc40 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -118,8 +118,6 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int nfss->fscache_key = NULL; nfss->fscache = NULL; - if (!(nfss->options & NFS_OPTION_FSCACHE)) - return; if (!uniq) { uniq = ""; ulen = 1; @@ -188,7 +186,8 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int /* create a cache index for looking up filehandles */ nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache, &nfs_fscache_super_index_def, - key, sizeof(*key) + ulen, + &key->key, + sizeof(key->key) + ulen, NULL, 0, nfss, 0, true); dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n", @@ -226,6 +225,19 @@ void nfs_fscache_release_super_cookie(struct super_block *sb) } } +static void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata, + struct nfs_inode *nfsi) +{ + memset(auxdata, 0, sizeof(*auxdata)); + auxdata->mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; + auxdata->mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; + auxdata->ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; + auxdata->ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; + + if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) + auxdata->change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); +} + /* * Initialise the per-inode cache cookie pointer for an NFS inode. 
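With nfs_fscache_update_auxdata() factored out above, each converted call site in the hunks below reduces to the same two-step pattern; for instance, when relinquishing a cookie:

	struct nfs_fscache_inode_auxdata auxdata;

	nfs_fscache_update_auxdata(&auxdata, nfsi);
	fscache_relinquish_cookie(cookie, &auxdata, false);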
*/ @@ -239,14 +251,7 @@ void nfs_fscache_init_inode(struct inode *inode) if (!(nfss->fscache && S_ISREG(inode->i_mode))) return; - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; - auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; - auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; - auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; - - if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4) - auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode); + nfs_fscache_update_auxdata(&auxdata, nfsi); nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache, &nfs_fscache_inode_object_def, @@ -266,11 +271,7 @@ void nfs_fscache_clear_inode(struct inode *inode) dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie); - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; - auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; - auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; - auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; + nfs_fscache_update_auxdata(&auxdata, nfsi); fscache_relinquish_cookie(cookie, &auxdata, false); nfsi->fscache = NULL; } @@ -310,11 +311,7 @@ void nfs_fscache_open_file(struct inode *inode, struct file *filp) if (!fscache_cookie_valid(cookie)) return; - memset(&auxdata, 0, sizeof(auxdata)); - auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec; - auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec; - auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec; - auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec; + nfs_fscache_update_auxdata(&auxdata, nfsi); if (inode_is_open_for_write(inode)) { dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi); diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 35c8cb2d7637..dda5c3e65d8d 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -30,6 +30,7 @@ #define encode_dirpath_sz (1 + XDR_QUADLEN(MNTPATHLEN)) #define MNT_status_sz (1) #define MNT_fhandle_sz XDR_QUADLEN(NFS2_FHSIZE) +#define MNT_fhandlev3_sz XDR_QUADLEN(NFS3_FHSIZE) #define MNT_authflav3_sz (1 + NFS_MAX_SECFLAVORS) /* @@ -37,7 +38,7 @@ */ #define MNT_enc_dirpath_sz encode_dirpath_sz #define MNT_dec_mountres_sz (MNT_status_sz + MNT_fhandle_sz) -#define MNT_dec_mountres3_sz (MNT_status_sz + MNT_fhandle_sz + \ +#define MNT_dec_mountres3_sz (MNT_status_sz + MNT_fhandlev3_sz + \ MNT_authflav3_sz) /* diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index c5c3fc6e6c60..26c94b32d6f4 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -253,37 +253,45 @@ int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type) { - struct posix_acl *alloc = NULL, *dfacl = NULL; + struct posix_acl *orig = acl, *dfacl = NULL, *alloc; int status; if (S_ISDIR(inode->i_mode)) { switch(type) { case ACL_TYPE_ACCESS: - alloc = dfacl = get_acl(inode, ACL_TYPE_DEFAULT); + alloc = get_acl(inode, ACL_TYPE_DEFAULT); if (IS_ERR(alloc)) goto fail; + dfacl = alloc; break; case ACL_TYPE_DEFAULT: - dfacl = acl; - alloc = acl = get_acl(inode, ACL_TYPE_ACCESS); + alloc = get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(alloc)) goto fail; + dfacl = acl; + acl = alloc; break; } } if (acl == NULL) { - alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); + alloc = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); if (IS_ERR(alloc)) goto fail; + acl = alloc; } status = __nfs3_proc_setacls(inode, acl, dfacl); - posix_acl_release(alloc); +out: + if (acl != orig) + 
posix_acl_release(acl); + if (dfacl != orig) + posix_acl_release(dfacl); return status; fail: - return PTR_ERR(alloc); + status = PTR_ERR(alloc); + goto out; } const struct xattr_handler *nfs3_xattr_handlers[] = { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 512afb1c7867..9056f3dd380e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6347,7 +6347,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, .rpc_client = server->client, .rpc_message = &msg, .callback_ops = &nfs4_delegreturn_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | RPC_TASK_TIMEOUT, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, }; int status = 0; @@ -7891,6 +7891,7 @@ static void nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata) { struct nfs41_bind_conn_to_session_args *args = task->tk_msg.rpc_argp; + struct nfs41_bind_conn_to_session_res *res = task->tk_msg.rpc_resp; struct nfs_client *clp = args->client; switch (task->tk_status) { @@ -7899,6 +7900,12 @@ nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata) nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); } + if (args->dir == NFS4_CDFC4_FORE_OR_BOTH && + res->dir != NFS4_CDFS4_BOTH) { + rpc_task_close_connection(task); + if (args->retries++ < MAX_BIND_CONN_TO_SESSION_RETRIES) + rpc_restart_call(task); + } } static const struct rpc_call_ops nfs4_bind_one_conn_to_session_ops = { @@ -7921,6 +7928,7 @@ int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt, struct nfs41_bind_conn_to_session_args args = { .client = clp, .dir = NFS4_CDFC4_FORE_OR_BOTH, + .retries = 0, }; struct nfs41_bind_conn_to_session_res res; struct rpc_message msg = { @@ -9191,8 +9199,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0); task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return ERR_CAST(task); + status = rpc_wait_for_completion_task(task); if (status != 0) goto out; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index ac93715c05a4..a8dc25ce48bb 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -734,9 +734,9 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner) state = new; state->owner = owner; atomic_inc(&owner->so_count); - list_add_rcu(&state->inode_states, &nfsi->open_states); ihold(inode); state->inode = inode; + list_add_rcu(&state->inode_states, &nfsi->open_states); spin_unlock(&inode->i_lock); /* Note: The reclaim code dictates that we add stateless * and read-only stateids to the end of the list */ diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index f61f96603df7..6ca421cbe19c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -752,7 +752,7 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, .callback_ops = call_ops, .callback_data = hdr, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | flags, + .flags = RPC_TASK_ASYNC | flags, }; hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how); @@ -950,7 +950,8 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) hdr->cred, NFS_PROTO(hdr->inode), desc->pg_rpc_callops, - desc->pg_ioflags, 0); + desc->pg_ioflags, + RPC_TASK_CRED_NOREF); return ret; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index b8d78f393365..dd2e14f5875d 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1332,13 +1332,15 @@ _pnfs_return_layout(struct inode *ino) !valid_layout) { spin_unlock(&ino->i_lock); dprintk("NFS: %s no 
layout segments to return\n", __func__); - goto out_put_layout_hdr; + goto out_wait_layoutreturn; } send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL); spin_unlock(&ino->i_lock); if (send) status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, true); +out_wait_layoutreturn: + wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE); out_put_layout_hdr: pnfs_free_lseg_list(&tmp_list); pnfs_put_layout_hdr(lo); @@ -1456,18 +1458,15 @@ retry: /* lo ref dropped in pnfs_roc_release() */ layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &lc_cred, &iomode); /* If the creds don't match, we can't compound the layoutreturn */ - if (!layoutreturn) + if (!layoutreturn || cred_fscmp(cred, lc_cred) != 0) goto out_noroc; - if (cred_fscmp(cred, lc_cred) != 0) - goto out_noroc_put_cred; roc = layoutreturn; pnfs_init_layoutreturn_args(args, lo, &stateid, iomode); res->lrs_present = 0; layoutreturn = false; - -out_noroc_put_cred: put_cred(lc_cred); + out_noroc: spin_unlock(&ino->i_lock); rcu_read_unlock(); diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index e7ddbce48321..679767ac258d 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -536,7 +536,8 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, nfs_init_commit(data, NULL, NULL, cinfo); nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(data->inode), - data->mds_ops, how, 0); + data->mds_ops, how, + RPC_TASK_CRED_NOREF); } else { nfs_init_commit(data, NULL, data->lseg, cinfo); initiate_commit(data, how); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 59ef3b13ccca..7a70287f21a2 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -185,7 +185,7 @@ static int __nfs_list_for_each_server(struct list_head *head, rcu_read_lock(); list_for_each_entry_rcu(server, head, client_link) { - if (!nfs_sb_active(server->super)) + if (!(server->super && nfs_sb_active(server->super))) continue; rcu_read_unlock(); if (last) @@ -1189,7 +1189,6 @@ static void nfs_get_cache_cookie(struct super_block *sb, uniq = ctx->fscache_uniq; ulen = strlen(ctx->fscache_uniq); } - return; } nfs_fscache_get_super_cookie(sb, uniq, ulen); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index df4b87c30ac9..1e767f779c49 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1695,7 +1695,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, .callback_ops = call_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | flags, + .flags = RPC_TASK_ASYNC | flags, .priority = priority, }; /* Set up the initial task struct. */ @@ -1813,7 +1813,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, nfs_init_commit(data, head, NULL, cinfo); atomic_inc(&cinfo->mds->rpcs_out); return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode), - data->mds_ops, how, 0); + data->mds_ops, how, RPC_TASK_CRED_NOREF); } /* diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c3b11a715082..5cf91322de0f 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1312,6 +1312,7 @@ nfsd4_run_cb_work(struct work_struct *work) container_of(work, struct nfsd4_callback, cb_work); struct nfs4_client *clp = cb->cb_clp; struct rpc_clnt *clnt; + int flags; if (cb->cb_need_restart) { cb->cb_need_restart = false; @@ -1340,7 +1341,8 @@ nfsd4_run_cb_work(struct work_struct *work) } cb->cb_msg.rpc_cred = clp->cl_cb_cred; - rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, + flags = clp->cl_minorversion ? 
RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN; + rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags, cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb); } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index a8fb18609146..9e40dfecf1b1 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -127,16 +127,8 @@ nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) goto out; } - { - SHASH_DESC_ON_STACK(desc, tfm); - - desc->tfm = tfm; - - status = crypto_shash_digest(desc, clname->data, clname->len, - cksum.data); - shash_desc_zero(desc); - } - + status = crypto_shash_tfm_digest(tfm, clname->data, clname->len, + cksum.data); if (status) goto out; @@ -1148,7 +1140,6 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) struct crypto_shash *tfm = cn->cn_tfm; struct xdr_netobj cksum; char *principal = NULL; - SHASH_DESC_ON_STACK(desc, tfm); /* Don't upcall if it's already stored */ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) @@ -1170,16 +1161,14 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) else if (clp->cl_cred.cr_principal) principal = clp->cl_cred.cr_principal; if (principal) { - desc->tfm = tfm; cksum.len = crypto_shash_digestsize(tfm); cksum.data = kmalloc(cksum.len, GFP_KERNEL); if (cksum.data == NULL) { ret = -ENOMEM; goto out; } - ret = crypto_shash_digest(desc, principal, strlen(principal), - cksum.data); - shash_desc_zero(desc); + ret = crypto_shash_tfm_digest(tfm, principal, strlen(principal), + cksum.data); if (ret) { kfree(cksum.data); goto out; @@ -1343,7 +1332,6 @@ nfsd4_cld_check_v2(struct nfs4_client *clp) struct crypto_shash *tfm = cn->cn_tfm; struct xdr_netobj cksum; char *principal = NULL; - SHASH_DESC_ON_STACK(desc, tfm); /* did we already find that this client is stable? */ if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) @@ -1381,14 +1369,12 @@ found: principal = clp->cl_cred.cr_principal; if (principal == NULL) return -ENOENT; - desc->tfm = tfm; cksum.len = crypto_shash_digestsize(tfm); cksum.data = kmalloc(cksum.len, GFP_KERNEL); if (cksum.data == NULL) return -ENOENT; - status = crypto_shash_digest(desc, principal, strlen(principal), - cksum.data); - shash_desc_zero(desc); + status = crypto_shash_tfm_digest(tfm, principal, + strlen(principal), cksum.data); if (status) { kfree(cksum.data); return -ENOENT; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e32ecedece0f..c107caa56525 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -267,6 +267,8 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, if (!nbl) { nbl= kmalloc(sizeof(*nbl), GFP_KERNEL); if (nbl) { + INIT_LIST_HEAD(&nbl->nbl_list); + INIT_LIST_HEAD(&nbl->nbl_lru); fh_copy_shallow(&nbl->nbl_fh, fh); locks_init_lock(&nbl->nbl_lock); nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client, diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 5435a40f82be..c18459cea6f4 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -520,7 +520,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC); BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 20); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19); mask = fanotify_group_event_mask(group, iter_info, mask, data, data_type); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 8e4f1ace467c..1de77f1a600b 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -275,7 +275,6 @@ static ssize_t 
dlmfs_file_write(struct file *filp, loff_t *ppos) { int bytes_left; - ssize_t writelen; char *lvb_buf; struct inode *inode = file_inode(filp); @@ -285,32 +284,30 @@ static ssize_t dlmfs_file_write(struct file *filp, if (*ppos >= i_size_read(inode)) return -ENOSPC; + /* don't write past the lvb */ + if (count > i_size_read(inode) - *ppos) + count = i_size_read(inode) - *ppos; + if (!count) return 0; if (!access_ok(buf, count)) return -EFAULT; - /* don't write past the lvb */ - if ((count + *ppos) > i_size_read(inode)) - writelen = i_size_read(inode) - *ppos; - else - writelen = count - *ppos; - - lvb_buf = kmalloc(writelen, GFP_NOFS); + lvb_buf = kmalloc(count, GFP_NOFS); if (!lvb_buf) return -ENOMEM; - bytes_left = copy_from_user(lvb_buf, buf, writelen); - writelen -= bytes_left; - if (writelen) - user_dlm_write_lvb(inode, lvb_buf, writelen); + bytes_left = copy_from_user(lvb_buf, buf, count); + count -= bytes_left; + if (count) + user_dlm_write_lvb(inode, lvb_buf, count); kfree(lvb_buf); - *ppos = *ppos + writelen; - mlog(0, "wrote %zd bytes\n", writelen); - return writelen; + *ppos = *ppos + count; + mlog(0, "wrote %zu bytes\n", count); + return count; } static void dlmfs_init_once(void *foo) diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 475c61f53f0f..ed5c1078919c 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -783,6 +783,9 @@ static struct ovl_fh *ovl_fid_to_fh(struct fid *fid, int buflen, int fh_type) if (fh_type != OVL_FILEID_V0) return ERR_PTR(-EINVAL); + if (buflen <= OVL_FH_WIRE_OFFSET) + return ERR_PTR(-EINVAL); + fh = kzalloc(buflen, GFP_KERNEL); if (!fh) return ERR_PTR(-ENOMEM); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index b0d42ece4d7c..981f11ec51bc 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -58,6 +58,24 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) attr->ia_valid &= ~ATTR_MODE; + /* + * We might have to translate ovl file into real file object + * once use cases emerge. For now, simply don't let underlying + * filesystem rely on attr->ia_file + */ + attr->ia_valid &= ~ATTR_FILE; + + /* + * If open(O_TRUNC) is done, VFS calls ->setattr with ATTR_OPEN + * set. Overlayfs does not pass O_TRUNC flag to underlying + * filesystem during open -> do not pass ATTR_OPEN. This + * disables optimization in fuse which assumes open(O_TRUNC) + * already set file size to 0. But we never passed O_TRUNC to + * fuse. So by clearing ATTR_OPEN, fuse will be forced to send + * setattr request to server. 
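Condensed, the two ia_valid adjustments this comment describes amount to the following (ovl_sanitize_iattr() is a hypothetical name, shown only to summarize the effect; the real code clears the bits inline as below):

	static void ovl_sanitize_iattr(struct iattr *attr)
	{
		/* never forward the overlay's file pointer to the lower fs */
		attr->ia_valid &= ~ATTR_FILE;
		/* O_TRUNC was not passed through, so drop the ATTR_OPEN hint */
		attr->ia_valid &= ~ATTR_OPEN;
	}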
+ */ + attr->ia_valid &= ~ATTR_OPEN; + inode_lock(upperdentry->d_inode); old_cred = ovl_override_creds(dentry->d_sb); err = notify_change(upperdentry, attr, NULL); diff --git a/fs/pnode.c b/fs/pnode.c index 49f6d7ff2139..1106137c747a 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -261,14 +261,13 @@ static int propagate_one(struct mount *m) child = copy_tree(last_source, last_source->mnt.mnt_root, type); if (IS_ERR(child)) return PTR_ERR(child); + read_seqlock_excl(&mount_lock); mnt_set_mountpoint(m, mp, child); + if (m->mnt_master != dest_master) + SET_MNT_MARK(m->mnt_master); + read_sequnlock_excl(&mount_lock); last_dest = m; last_source = child; - if (m->mnt_master != dest_master) { - read_seqlock_excl(&mount_lock); - SET_MNT_MARK(m->mnt_master); - read_sequnlock_excl(&mount_lock); - } hlist_add_head(&child->mnt_hash, list); return count_mounts(m->mnt_ns, child); } diff --git a/fs/proc/base.c b/fs/proc/base.c index 572898dd16a0..eb2255e95f62 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3286,7 +3286,6 @@ static const struct inode_operations proc_tgid_base_inode_operations = { void proc_flush_pid(struct pid *pid) { proc_invalidate_siblings_dcache(&pid->inodes, &pid->lock); - put_pid(pid); } static struct dentry *proc_pid_instantiate(struct dentry * dentry, diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 7dc800cce354..c663202da8de 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -266,7 +266,8 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst, if (start < offset + dump->size) { tsz = min(offset + (u64)dump->size - start, (u64)size); buf = dump->buf + start - offset; - if (remap_vmalloc_range_partial(vma, dst, buf, tsz)) { + if (remap_vmalloc_range_partial(vma, dst, buf, 0, + tsz)) { ret = -EFAULT; goto out_unlock; } @@ -624,7 +625,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size); kaddr = elfnotes_buf + start - elfcorebuf_sz - vmcoredd_orig_sz; if (remap_vmalloc_range_partial(vma, vma->vm_start + len, - kaddr, tsz)) + kaddr, 0, tsz)) goto fail; size -= tsz; diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index 8f0369aad22a..e16a49ebfe54 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -153,3 +153,112 @@ config PSTORE_RAM "ramoops.ko". For more information, see Documentation/admin-guide/ramoops.rst. + +config PSTORE_ZONE + tristate + depends on PSTORE + help + The common layer for pstore/blk (and pstore/ram in the future) + to manage storage in zones. + +config PSTORE_BLK + tristate "Log panic/oops to a block device" + depends on PSTORE + depends on BLOCK + select PSTORE_ZONE + default n + help + This enables panic and oops messages to be logged to a block device + where they can be read back at some later point. + + For more information, see Documentation/admin-guide/pstore-blk.rst + + If unsure, say N. + +config PSTORE_BLK_BLKDEV + string "block device identifier" + depends on PSTORE_BLK + default "" + help + Which block device should be used for pstore/blk. + + It accepts the following variants: + 1) <hex_major><hex_minor> device number in hexadecimal representation, + with no leading 0x, for example b302. + 2) /dev/<disk_name> represents the device name of disk + 3) /dev/<disk_name><decimal> represents the device name and number + of partition - device number of disk plus the partition number + 4) /dev/<disk_name>p<decimal> - same as the above, this form is + used when disk name of partitioned disk ends with a digit.
+ 5) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + unique id of a partition if the partition table provides it. + The UUID may be either an EFI/GPT UUID, or refer to an MSDOS + partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- + filled hex representation of the 32-bit "NT disk signature", and PP + is a zero-filled hex representation of the 1-based partition number. + 6) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation + to a partition with a known unique id. + 7) <major>:<minor> major and minor number of the device separated by + a colon. + + Note that both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_KMSG_SIZE + int "Size in Kbytes of kmsg dump log to store" + depends on PSTORE_BLK + default 64 + help + This sets the size of the kmsg dump (oops, panic, etc) log for + pstore/blk. The size is in KB and must be a multiple of 4. + + Note that both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_MAX_REASON + int "Maximum kmsg dump reason to store" + depends on PSTORE_BLK + default 2 + help + The maximum reason for kmsg dumps to store. The default is + 2 (KMSG_DUMP_OOPS), see include/linux/kmsg_dump.h's + enum kmsg_dump_reason for more details. + + Note that both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_PMSG_SIZE + int "Size in Kbytes of pmsg to store" + depends on PSTORE_BLK + depends on PSTORE_PMSG + default 64 + help + This sets the size of pmsg (pmsg_size) for pstore/blk. The size is + in KB and must be a multiple of 4. + + Note that both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_CONSOLE_SIZE + int "Size in Kbytes of console log to store" + depends on PSTORE_BLK + depends on PSTORE_CONSOLE + default 64 + help + This sets the size of the console log (console_size) to store via + pstore/blk. The size is in KB and must be a multiple of 4. + + Note that both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. + +config PSTORE_BLK_FTRACE_SIZE + int "Size in Kbytes of ftrace log to store" + depends on PSTORE_BLK + depends on PSTORE_FTRACE + default 64 + help + This sets the size of the ftrace log (ftrace_size) for pstore/blk. The + size is in KB and must be a multiple of 4. + + Note that both Kconfig and module parameters can configure + pstore/blk, but module parameters have priority over Kconfig. diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile index 967b5891f325..c270467aeece 100644 --- a/fs/pstore/Makefile +++ b/fs/pstore/Makefile @@ -12,3 +12,9 @@ pstore-$(CONFIG_PSTORE_PMSG) += pmsg.o ramoops-objs += ram.o ram_core.o obj-$(CONFIG_PSTORE_RAM) += ramoops.o + +pstore_zone-objs += zone.o +obj-$(CONFIG_PSTORE_ZONE) += pstore_zone.o + +pstore_blk-objs += blk.o +obj-$(CONFIG_PSTORE_BLK) += pstore_blk.o diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c new file mode 100644 index 000000000000..fcd5563dde06 --- /dev/null +++ b/fs/pstore/blk.c @@ -0,0 +1,517 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implements pstore backend driver that writes to block (or non-block) storage + devices, using the pstore/zone API.
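For reference, all of these sizes (and the device itself) can also be set at runtime through the module parameters declared in blk.c below, which take priority over the Kconfig defaults; a hypothetical kernel command line (device path illustrative only) might be:

	pstore_blk.blkdev=/dev/mmcblk0p7 pstore_blk.kmsg_size=64 pstore_blk.max_reason=2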
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include "../../block/blk.h" +#include <linux/blkdev.h> +#include <linux/string.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/platform_device.h> +#include <linux/pstore_blk.h> +#include <linux/mount.h> +#include <linux/uio.h> + +static long kmsg_size = CONFIG_PSTORE_BLK_KMSG_SIZE; +module_param(kmsg_size, long, 0400); +MODULE_PARM_DESC(kmsg_size, "kmsg dump record size in kbytes"); + +static int max_reason = CONFIG_PSTORE_BLK_MAX_REASON; +module_param(max_reason, int, 0400); +MODULE_PARM_DESC(max_reason, + "maximum reason for kmsg dump (default 2: Oops and Panic)"); + +#if IS_ENABLED(CONFIG_PSTORE_PMSG) +static long pmsg_size = CONFIG_PSTORE_BLK_PMSG_SIZE; +#else +static long pmsg_size = -1; +#endif +module_param(pmsg_size, long, 0400); +MODULE_PARM_DESC(pmsg_size, "pmsg size in kbytes"); + +#if IS_ENABLED(CONFIG_PSTORE_CONSOLE) +static long console_size = CONFIG_PSTORE_BLK_CONSOLE_SIZE; +#else +static long console_size = -1; +#endif +module_param(console_size, long, 0400); +MODULE_PARM_DESC(console_size, "console size in kbytes"); + +#if IS_ENABLED(CONFIG_PSTORE_FTRACE) +static long ftrace_size = CONFIG_PSTORE_BLK_FTRACE_SIZE; +#else +static long ftrace_size = -1; +#endif +module_param(ftrace_size, long, 0400); +MODULE_PARM_DESC(ftrace_size, "ftrace size in kbytes"); + +static bool best_effort; +module_param(best_effort, bool, 0400); +MODULE_PARM_DESC(best_effort, "use best effort to write (i.e. do not require storage driver pstore support, default: off)"); + +/* + * blkdev - the block device to use for pstore storage + * + * Usually, this will be a partition of a block device. + * + * blkdev accepts the following variants: + * 1) <hex_major><hex_minor> device number in hexadecimal representation, + * with no leading 0x, for example b302. + * 2) /dev/<disk_name> represents the device number of disk + * 3) /dev/<disk_name><decimal> represents the device number + * of partition - device number of disk plus the partition number + * 4) /dev/<disk_name>p<decimal> - same as the above, that form is + * used when disk name of partitioned disk ends on a digit. + * 5) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + * unique id of a partition if the partition table provides it. + * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS + * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- + * filled hex representation of the 32-bit "NT disk signature", and PP + * is a zero-filled hex representation of the 1-based partition number. + * 6) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation to + * a partition with a known unique id. + * 7) <major>:<minor> major and minor number of the device separated by + * a colon. + */ +static char blkdev[80] = CONFIG_PSTORE_BLK_BLKDEV; +module_param_string(blkdev, blkdev, 80, 0400); +MODULE_PARM_DESC(blkdev, "block device for pstore storage"); + +/* + * All globals must only be accessed under the pstore_blk_lock + * during the register/unregister functions. + */ +static DEFINE_MUTEX(pstore_blk_lock); +static struct block_device *psblk_bdev; +static struct pstore_zone_info *pstore_zone_info; +static pstore_blk_panic_write_op blkdev_panic_write; + +struct bdev_info { + dev_t devt; + sector_t nr_sects; + sector_t start_sect; +}; + +#define check_size(name, alignsize) ({ \ + long _##name_ = (name); \ + _##name_ = _##name_ <= 0 ? 
0 : (_##name_ * 1024); \ + if (_##name_ & ((alignsize) - 1)) { \ + pr_info(#name " must align to %d\n", \ + (alignsize)); \ + _##name_ = ALIGN(name, (alignsize)); \ + } \ + _##name_; \ +}) + +static int __register_pstore_device(struct pstore_device_info *dev) +{ + int ret; + + lockdep_assert_held(&pstore_blk_lock); + + if (!dev || !dev->total_size || !dev->read || !dev->write) + return -EINVAL; + + /* someone already registered before */ + if (pstore_zone_info) + return -EBUSY; + + pstore_zone_info = kzalloc(sizeof(struct pstore_zone_info), GFP_KERNEL); + if (!pstore_zone_info) + return -ENOMEM; + + /* zero means no limit on which backends to attempt to store. */ + if (!dev->flags) + dev->flags = UINT_MAX; + +#define verify_size(name, alignsize, enabled) { \ + long _##name_; \ + if (enabled) \ + _##name_ = check_size(name, alignsize); \ + else \ + _##name_ = 0; \ + name = _##name_ / 1024; \ + pstore_zone_info->name = _##name_; \ + } + + verify_size(kmsg_size, 4096, dev->flags & PSTORE_FLAGS_DMESG); + verify_size(pmsg_size, 4096, dev->flags & PSTORE_FLAGS_PMSG); + verify_size(console_size, 4096, dev->flags & PSTORE_FLAGS_CONSOLE); + verify_size(ftrace_size, 4096, dev->flags & PSTORE_FLAGS_FTRACE); +#undef verify_size + + pstore_zone_info->total_size = dev->total_size; + pstore_zone_info->max_reason = max_reason; + pstore_zone_info->read = dev->read; + pstore_zone_info->write = dev->write; + pstore_zone_info->erase = dev->erase; + pstore_zone_info->panic_write = dev->panic_write; + pstore_zone_info->name = KBUILD_MODNAME; + pstore_zone_info->owner = THIS_MODULE; + + ret = register_pstore_zone(pstore_zone_info); + if (ret) { + kfree(pstore_zone_info); + pstore_zone_info = NULL; + } + return ret; +} +/** + * register_pstore_device() - register non-block device to pstore/blk + * + * @dev: non-block device information + * + * Return: + * * 0 - OK + * * Others - something went wrong. + */ +int register_pstore_device(struct pstore_device_info *dev) +{ + int ret; + + mutex_lock(&pstore_blk_lock); + ret = __register_pstore_device(dev); + mutex_unlock(&pstore_blk_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(register_pstore_device); + +static void __unregister_pstore_device(struct pstore_device_info *dev) +{ + lockdep_assert_held(&pstore_blk_lock); + if (pstore_zone_info && pstore_zone_info->read == dev->read) { + unregister_pstore_zone(pstore_zone_info); + kfree(pstore_zone_info); + pstore_zone_info = NULL; + } +} + +/** + * unregister_pstore_device() - unregister non-block device from pstore/blk + * + * @dev: non-block device information + */ +void unregister_pstore_device(struct pstore_device_info *dev) +{ + mutex_lock(&pstore_blk_lock); + __unregister_pstore_device(dev); + mutex_unlock(&pstore_blk_lock); +} +EXPORT_SYMBOL_GPL(unregister_pstore_device); + +/** + * psblk_get_bdev() - open block device + * + * @holder: Exclusive holder identifier + * @info: Information about bdev to fill in + * + * Return: pointer to block device on success and others on error. + * + * On success, the returned block_device has reference count of one.
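To make the size arithmetic above concrete: check_size() takes a kilobyte count, converts it to bytes, and rounds up to the requested alignment (note the macro's final ALIGN() is fed the raw parameter; the sketch below shows the intended byte-based rounding). A minimal user-space rendering of the same computation, assuming nothing beyond libc:

	#include <stdio.h>

	#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((long)(a) - 1))

	/* KB -> bytes, rounded up to alignsize (a power of two) */
	static long check_size_kb(long kb, long alignsize)
	{
		long bytes = kb <= 0 ? 0 : kb * 1024;

		if (bytes & (alignsize - 1))
			bytes = ALIGN_UP(bytes, alignsize);
		return bytes;
	}

	int main(void)
	{
		printf("%ld\n", check_size_kb(64, 4096)); /* 65536: already aligned */
		printf("%ld\n", check_size_kb(3, 4096));  /* 4096: 3072 rounded up */
		return 0;
	}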
+ */ +static struct block_device *psblk_get_bdev(void *holder, + struct bdev_info *info) +{ + struct block_device *bdev = ERR_PTR(-ENODEV); + fmode_t mode = FMODE_READ | FMODE_WRITE; + sector_t nr_sects; + + lockdep_assert_held(&pstore_blk_lock); + + if (pstore_zone_info) + return ERR_PTR(-EBUSY); + + if (!blkdev[0]) + return ERR_PTR(-ENODEV); + + if (holder) + mode |= FMODE_EXCL; + bdev = blkdev_get_by_path(blkdev, mode, holder); + if (IS_ERR(bdev)) { + dev_t devt; + + devt = name_to_dev_t(blkdev); + if (devt == 0) + return ERR_PTR(-ENODEV); + bdev = blkdev_get_by_dev(devt, mode, holder); + if (IS_ERR(bdev)) + return bdev; + } + + nr_sects = part_nr_sects_read(bdev->bd_part); + if (!nr_sects) { + pr_err("not enough space for '%s'\n", blkdev); + blkdev_put(bdev, mode); + return ERR_PTR(-ENOSPC); + } + + if (info) { + info->devt = bdev->bd_dev; + info->nr_sects = nr_sects; + info->start_sect = get_start_sect(bdev); + } + + return bdev; +} + +static void psblk_put_bdev(struct block_device *bdev, void *holder) +{ + fmode_t mode = FMODE_READ | FMODE_WRITE; + + lockdep_assert_held(&pstore_blk_lock); + + if (!bdev) + return; + + if (holder) + mode |= FMODE_EXCL; + blkdev_put(bdev, mode); +} + +static ssize_t psblk_generic_blk_read(char *buf, size_t bytes, loff_t pos) +{ + struct block_device *bdev = psblk_bdev; + struct file file; + struct kiocb kiocb; + struct iov_iter iter; + struct kvec iov = {.iov_base = buf, .iov_len = bytes}; + + if (!bdev) + return -ENODEV; + + memset(&file, 0, sizeof(struct file)); + file.f_mapping = bdev->bd_inode->i_mapping; + file.f_flags = O_DSYNC | __O_SYNC | O_NOATIME; + file.f_inode = bdev->bd_inode; + file_ra_state_init(&file.f_ra, file.f_mapping); + + init_sync_kiocb(&kiocb, &file); + kiocb.ki_pos = pos; + iov_iter_kvec(&iter, READ, &iov, 1, bytes); + + return generic_file_read_iter(&kiocb, &iter); +} + +static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes, + loff_t pos) +{ + struct block_device *bdev = psblk_bdev; + struct iov_iter iter; + struct kiocb kiocb; + struct file file; + ssize_t ret; + struct kvec iov = {.iov_base = (void *)buf, .iov_len = bytes}; + + if (!bdev) + return -ENODEV; + + /* Console/Ftrace backend may handle buffer until flush dirty zones */ + if (in_interrupt() || irqs_disabled()) + return -EBUSY; + + memset(&file, 0, sizeof(struct file)); + file.f_mapping = bdev->bd_inode->i_mapping; + file.f_flags = O_DSYNC | __O_SYNC | O_NOATIME; + file.f_inode = bdev->bd_inode; + + init_sync_kiocb(&kiocb, &file); + kiocb.ki_pos = pos; + iov_iter_kvec(&iter, WRITE, &iov, 1, bytes); + + inode_lock(bdev->bd_inode); + ret = generic_write_checks(&kiocb, &iter); + if (ret > 0) + ret = generic_perform_write(&file, &iter, pos); + inode_unlock(bdev->bd_inode); + + if (likely(ret > 0)) { + const struct file_operations f_op = {.fsync = blkdev_fsync}; + + file.f_op = &f_op; + kiocb.ki_pos += ret; + ret = generic_write_sync(&kiocb, ret); + } + return ret; +} + +static ssize_t psblk_blk_panic_write(const char *buf, size_t size, + loff_t off) +{ + int ret; + + if (!blkdev_panic_write) + return -EOPNOTSUPP; + + /* size and off must align to SECTOR_SIZE for block device */ + ret = blkdev_panic_write(buf, off >> SECTOR_SHIFT, + size >> SECTOR_SHIFT); + /* try next zone */ + if (ret == -ENOMSG) + return ret; + return ret ? 
-EIO : size; +} + +static int __register_pstore_blk(struct pstore_blk_info *info) +{ + char bdev_name[BDEVNAME_SIZE]; + struct block_device *bdev; + struct pstore_device_info dev; + struct bdev_info binfo; + void *holder = blkdev; + int ret = -ENODEV; + + lockdep_assert_held(&pstore_blk_lock); + + /* hold bdev exclusively */ + memset(&binfo, 0, sizeof(binfo)); + bdev = psblk_get_bdev(holder, &binfo); + if (IS_ERR(bdev)) { + pr_err("failed to open '%s'!\n", blkdev); + return PTR_ERR(bdev); + } + + /* only allow driver matching the @blkdev */ + if (!binfo.devt || (!best_effort && + MAJOR(binfo.devt) != info->major)) { + pr_debug("invalid major %u (expect %u)\n", + info->major, MAJOR(binfo.devt)); + ret = -ENODEV; + goto err_put_bdev; + } + + /* psblk_bdev must be assigned before register to pstore/blk */ + psblk_bdev = bdev; + blkdev_panic_write = info->panic_write; + + /* Copy back block device details. */ + info->devt = binfo.devt; + info->nr_sects = binfo.nr_sects; + info->start_sect = binfo.start_sect; + + memset(&dev, 0, sizeof(dev)); + dev.total_size = info->nr_sects << SECTOR_SHIFT; + dev.flags = info->flags; + dev.read = psblk_generic_blk_read; + dev.write = psblk_generic_blk_write; + dev.erase = NULL; + dev.panic_write = info->panic_write ? psblk_blk_panic_write : NULL; + + ret = __register_pstore_device(&dev); + if (ret) + goto err_put_bdev; + + bdevname(bdev, bdev_name); + pr_info("attached %s%s\n", bdev_name, + info->panic_write ? "" : " (no dedicated panic_write!)"); + return 0; + +err_put_bdev: + psblk_bdev = NULL; + blkdev_panic_write = NULL; + psblk_put_bdev(bdev, holder); + return ret; +} + +/** + * register_pstore_blk() - register block device to pstore/blk + * + * @info: details on the desired block device interface + * + * Return: + * * 0 - OK + * * Others - something went wrong.
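As a usage illustration (every mydev_*/MYDEV_* identifier is hypothetical, not part of this patch), a block driver with a dedicated panic path would fill in the interface and register roughly like this, assuming panic_write takes a buffer plus starting sector and sector count as used by psblk_blk_panic_write() above:

	/* sketch only: mydev_raw_write() stands in for the driver's panic-safe I/O */
	static int mydev_panic_write(const char *buf, sector_t start_sect,
				     sector_t sects)
	{
		return mydev_raw_write(buf, start_sect, sects);
	}

	static int __init mydev_pstore_init(void)
	{
		struct pstore_blk_info info = {
			.major		= MYDEV_MAJOR,	      /* must match blkdev= */
			.flags		= PSTORE_FLAGS_DMESG, /* kmsg dumps only */
			.panic_write	= mydev_panic_write,
		};

		return register_pstore_blk(&info);
	}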
+ */ +int register_pstore_blk(struct pstore_blk_info *info) +{ + int ret; + + mutex_lock(&pstore_blk_lock); + ret = __register_pstore_blk(info); + mutex_unlock(&pstore_blk_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(register_pstore_blk); + +static void __unregister_pstore_blk(unsigned int major) +{ + struct pstore_device_info dev = { .read = psblk_generic_blk_read }; + void *holder = blkdev; + + lockdep_assert_held(&pstore_blk_lock); + if (psblk_bdev && MAJOR(psblk_bdev->bd_dev) == major) { + __unregister_pstore_device(&dev); + psblk_put_bdev(psblk_bdev, holder); + blkdev_panic_write = NULL; + psblk_bdev = NULL; + } +} + +/** + * unregister_pstore_blk() - unregister block device from pstore/blk + * + * @major: the major device number of device + */ +void unregister_pstore_blk(unsigned int major) +{ + mutex_lock(&pstore_blk_lock); + __unregister_pstore_blk(major); + mutex_unlock(&pstore_blk_lock); +} +EXPORT_SYMBOL_GPL(unregister_pstore_blk); + +/* get information of pstore/blk */ +int pstore_blk_get_config(struct pstore_blk_config *info) +{ + strncpy(info->device, blkdev, 80); + info->max_reason = max_reason; + info->kmsg_size = check_size(kmsg_size, 4096); + info->pmsg_size = check_size(pmsg_size, 4096); + info->ftrace_size = check_size(ftrace_size, 4096); + info->console_size = check_size(console_size, 4096); + + return 0; +} +EXPORT_SYMBOL_GPL(pstore_blk_get_config); + +static int __init pstore_blk_init(void) +{ + struct pstore_blk_info info = { }; + int ret = 0; + + mutex_lock(&pstore_blk_lock); + if (!pstore_zone_info && best_effort && blkdev[0]) + ret = __register_pstore_blk(&info); + mutex_unlock(&pstore_blk_lock); + + return ret; +} +late_initcall(pstore_blk_init); + +static void __exit pstore_blk_exit(void) +{ + mutex_lock(&pstore_blk_lock); + if (psblk_bdev) + __unregister_pstore_blk(MAJOR(psblk_bdev->bd_dev)); + else { + struct pstore_device_info dev = { }; + + if (pstore_zone_info) + dev.read = pstore_zone_info->read; + __unregister_pstore_device(&dev); + } + mutex_unlock(&pstore_blk_lock); +} +module_exit(pstore_blk_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("WeiXiong Liao <liaoweixiong@allwinnertech.com>"); +MODULE_AUTHOR("Kees Cook <keescook@chromium.org>"); +MODULE_DESCRIPTION("pstore backend for block devices"); diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index bfbfc2698070..5c0450701293 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -16,6 +16,7 @@ #include <linux/debugfs.h> #include <linux/err.h> #include <linux/cache.h> +#include <linux/slab.h> #include <asm/barrier.h> #include "internal.h" @@ -132,3 +133,56 @@ void pstore_unregister_ftrace(void) debugfs_remove_recursive(pstore_ftrace_dir); } + +ssize_t pstore_ftrace_combine_log(char **dest_log, size_t *dest_log_size, + const char *src_log, size_t src_log_size) +{ + size_t dest_size, src_size, total, dest_off, src_off; + size_t dest_idx = 0, src_idx = 0, merged_idx = 0; + void *merged_buf; + struct pstore_ftrace_record *drec, *srec, *mrec; + size_t record_size = sizeof(struct pstore_ftrace_record); + + dest_off = *dest_log_size % record_size; + dest_size = *dest_log_size - dest_off; + + src_off = src_log_size % record_size; + src_size = src_log_size - src_off; + + total = dest_size + src_size; + merged_buf = kmalloc(total, GFP_KERNEL); + if (!merged_buf) + return -ENOMEM; + + drec = (struct pstore_ftrace_record *)(*dest_log + dest_off); + srec = (struct pstore_ftrace_record *)(src_log + src_off); + mrec = (struct pstore_ftrace_record *)(merged_buf); + + while (dest_size > 0 && src_size > 0) { 
+ if (pstore_ftrace_read_timestamp(&drec[dest_idx]) < + pstore_ftrace_read_timestamp(&srec[src_idx])) { + mrec[merged_idx++] = drec[dest_idx++]; + dest_size -= record_size; + } else { + mrec[merged_idx++] = srec[src_idx++]; + src_size -= record_size; + } + } + + while (dest_size > 0) { + mrec[merged_idx++] = drec[dest_idx++]; + dest_size -= record_size; + } + + while (src_size > 0) { + mrec[merged_idx++] = srec[src_idx++]; + src_size -= record_size; + } + + kfree(*dest_log); + *dest_log = merged_buf; + *dest_log_size = total; + + return 0; +} +EXPORT_SYMBOL_GPL(pstore_ftrace_combine_log); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index d99b5d39aa90..c331efe8de95 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -22,18 +22,21 @@ #include <linux/magic.h> #include <linux/pstore.h> #include <linux/slab.h> -#include <linux/spinlock.h> #include <linux/uaccess.h> #include "internal.h" #define PSTORE_NAMELEN 64 -static DEFINE_SPINLOCK(allpstore_lock); -static LIST_HEAD(allpstore); +static DEFINE_MUTEX(records_list_lock); +static LIST_HEAD(records_list); + +static DEFINE_MUTEX(pstore_sb_lock); +static struct super_block *pstore_sb; struct pstore_private { struct list_head list; + struct dentry *dentry; struct pstore_record *record; size_t total_size; }; @@ -178,10 +181,22 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = d_inode(dentry)->i_private; struct pstore_record *record = p->record; + int rc = 0; if (!record->psi->erase) return -EPERM; + /* Make sure we can't race while removing this file. */ + mutex_lock(&records_list_lock); + if (!list_empty(&p->list)) + list_del_init(&p->list); + else + rc = -ENOENT; + p->dentry = NULL; + mutex_unlock(&records_list_lock); + if (rc) + return rc; + mutex_lock(&record->psi->read_mutex); record->psi->erase(record); mutex_unlock(&record->psi->read_mutex); @@ -192,15 +207,9 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) static void pstore_evict_inode(struct inode *inode) { struct pstore_private *p = inode->i_private; - unsigned long flags; clear_inode(inode); - if (p) { - spin_lock_irqsave(&allpstore_lock, flags); - list_del(&p->list); - spin_unlock_irqrestore(&allpstore_lock, flags); - free_pstore_private(p); - } + free_pstore_private(p); } static const struct inode_operations pstore_dir_inode_operations = { @@ -278,11 +287,54 @@ static const struct super_operations pstore_ops = { .show_options = pstore_show_options, }; -static struct super_block *pstore_sb; +static struct dentry *psinfo_lock_root(void) +{ + struct dentry *root; -bool pstore_is_mounted(void) + mutex_lock(&pstore_sb_lock); + /* + * Having no backend is fine -- no records appear. + * Not being mounted is fine -- nothing to do. 
+ */ + if (!psinfo || !pstore_sb) { + mutex_unlock(&pstore_sb_lock); + return NULL; + } + + root = pstore_sb->s_root; + inode_lock(d_inode(root)); + mutex_unlock(&pstore_sb_lock); + + return root; +} + +int pstore_put_backend_records(struct pstore_info *psi) { - return pstore_sb != NULL; + struct pstore_private *pos, *tmp; + struct dentry *root; + int rc = 0; + + root = psinfo_lock_root(); + if (!root) + return 0; + + mutex_lock(&records_list_lock); + list_for_each_entry_safe(pos, tmp, &records_list, list) { + if (pos->record->psi == psi) { + list_del_init(&pos->list); + rc = simple_unlink(d_inode(root), pos->dentry); + if (WARN_ON(rc)) + break; + d_drop(pos->dentry); + dput(pos->dentry); + pos->dentry = NULL; + } + } + mutex_unlock(&records_list_lock); + + inode_unlock(d_inode(root)); + + return rc; } /* @@ -297,23 +349,20 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) int rc = 0; char name[PSTORE_NAMELEN]; struct pstore_private *private, *pos; - unsigned long flags; size_t size = record->size + record->ecc_notice_size; - WARN_ON(!inode_is_locked(d_inode(root))); + if (WARN_ON(!inode_is_locked(d_inode(root)))) + return -EINVAL; - spin_lock_irqsave(&allpstore_lock, flags); - list_for_each_entry(pos, &allpstore, list) { + rc = -EEXIST; + /* Skip records that are already present in the filesystem. */ + mutex_lock(&records_list_lock); + list_for_each_entry(pos, &records_list, list) { if (pos->record->type == record->type && pos->record->id == record->id && - pos->record->psi == record->psi) { - rc = -EEXIST; - break; - } + pos->record->psi == record->psi) + goto fail; } - spin_unlock_irqrestore(&allpstore_lock, flags); - if (rc) - return rc; rc = -ENOMEM; inode = pstore_get_inode(root->d_sb); @@ -334,6 +383,7 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) if (!dentry) goto fail_private; + private->dentry = dentry; private->record = record; inode->i_size = private->total_size = size; inode->i_private = private; @@ -343,9 +393,8 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) d_add(dentry, inode); - spin_lock_irqsave(&allpstore_lock, flags); - list_add(&private->list, &allpstore); - spin_unlock_irqrestore(&allpstore_lock, flags); + list_add(&private->list, &records_list); + mutex_unlock(&records_list_lock); return 0; @@ -353,8 +402,8 @@ fail_private: free_pstore_private(private); fail_inode: iput(inode); - fail: + mutex_unlock(&records_list_lock); return rc; } @@ -366,16 +415,13 @@ fail: */ void pstore_get_records(int quiet) { - struct pstore_info *psi = psinfo; struct dentry *root; - if (!psi || !pstore_sb) + root = psinfo_lock_root(); + if (!root) return; - root = pstore_sb->s_root; - - inode_lock(d_inode(root)); - pstore_get_backend_records(psi, root, quiet); + pstore_get_backend_records(psinfo, root, quiet); inode_unlock(d_inode(root)); } @@ -383,8 +429,6 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; - pstore_sb = sb; - sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; @@ -405,6 +449,10 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) if (!sb->s_root) return -ENOMEM; + mutex_lock(&pstore_sb_lock); + pstore_sb = sb; + mutex_unlock(&pstore_sb_lock); + pstore_get_records(0); return 0; @@ -418,8 +466,17 @@ static struct dentry *pstore_mount(struct file_system_type *fs_type, static void pstore_kill_sb(struct super_block *sb) { + mutex_lock(&pstore_sb_lock); + WARN_ON(pstore_sb != sb); + 
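+ /*
+ * Tearing down the super block evicts every record inode, so the
+ * records list can simply be reinitialized afterwards.
+ */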
kill_litter_super(sb); pstore_sb = NULL; + + mutex_lock(&records_list_lock); + INIT_LIST_HEAD(&records_list); + mutex_unlock(&records_list_lock); + + mutex_unlock(&pstore_sb_lock); } static struct file_system_type pstore_fs_type = { diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 7062ea4bc57c..7fb219042f13 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -12,9 +12,18 @@ extern unsigned long kmsg_bytes; #ifdef CONFIG_PSTORE_FTRACE extern void pstore_register_ftrace(void); extern void pstore_unregister_ftrace(void); +ssize_t pstore_ftrace_combine_log(char **dest_log, size_t *dest_log_size, + const char *src_log, size_t src_log_size); #else static inline void pstore_register_ftrace(void) {} static inline void pstore_unregister_ftrace(void) {} +static inline ssize_t +pstore_ftrace_combine_log(char **dest_log, size_t *dest_log_size, + const char *src_log, size_t src_log_size) +{ + *dest_log_size = 0; + return 0; +} #endif #ifdef CONFIG_PSTORE_PMSG @@ -31,9 +40,9 @@ extern void pstore_set_kmsg_bytes(int); extern void pstore_get_records(int); extern void pstore_get_backend_records(struct pstore_info *psi, struct dentry *root, int quiet); +extern int pstore_put_backend_records(struct pstore_info *psi); extern int pstore_mkfile(struct dentry *root, struct pstore_record *record); -extern bool pstore_is_mounted(void); extern void pstore_record_init(struct pstore_record *record, struct pstore_info *psi); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 408277ee3cdb..a9e297eefdff 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -44,7 +44,7 @@ static int pstore_update_ms = -1; module_param_named(update_ms, pstore_update_ms, int, 0600); MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content " "(default is -1, which means runtime updates are disabled; " - "enabling this option is not safe, it may lead to further " + "enabling this option may not be safe; it may lead to further " "corruption on Oopses)"); /* Names should be in the same order as the enum pstore_type_id */ @@ -69,19 +69,25 @@ static void pstore_dowork(struct work_struct *); static DECLARE_WORK(pstore_work, pstore_dowork); /* - * pstore_lock just protects "psinfo" during - * calls to pstore_register() + * psinfo_lock protects "psinfo" during calls to + * pstore_register(), pstore_unregister(), and + * the filesystem mount/unmount routines. 
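+ * All of these paths can sleep, which is why a mutex replaces the
+ * old spinlock.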
*/ -static DEFINE_SPINLOCK(pstore_lock); +static DEFINE_MUTEX(psinfo_lock); struct pstore_info *psinfo; static char *backend; +module_param(backend, charp, 0444); +MODULE_PARM_DESC(backend, "specific backend to use"); + static char *compress = #ifdef CONFIG_PSTORE_COMPRESS_DEFAULT CONFIG_PSTORE_COMPRESS_DEFAULT; #else NULL; #endif +module_param(compress, charp, 0444); +MODULE_PARM_DESC(compress, "compression to use"); /* Compression parameters */ static struct crypto_comp *tfm; @@ -129,24 +135,12 @@ enum pstore_type_id pstore_name_to_type(const char *name) } EXPORT_SYMBOL_GPL(pstore_name_to_type); -static const char *get_reason_str(enum kmsg_dump_reason reason) +static void pstore_timer_kick(void) { - switch (reason) { - case KMSG_DUMP_PANIC: - return "Panic"; - case KMSG_DUMP_OOPS: - return "Oops"; - case KMSG_DUMP_EMERG: - return "Emergency"; - case KMSG_DUMP_RESTART: - return "Restart"; - case KMSG_DUMP_HALT: - return "Halt"; - case KMSG_DUMP_POWEROFF: - return "Poweroff"; - default: - return "Unknown"; - } + if (pstore_update_ms < 0) + return; + + mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms)); } /* @@ -393,7 +387,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned int part = 1; int ret; - why = get_reason_str(reason); + why = kmsg_dump_reason_str(reason); if (down_trylock(&psinfo->buf_lock)) { /* Failed to acquire lock: give up if we cannot wait. */ @@ -459,8 +453,10 @@ static void pstore_dump(struct kmsg_dumper *dumper, } ret = psinfo->write(&record); - if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) + if (ret == 0 && reason == KMSG_DUMP_OOPS) { pstore_new_entry = 1; + pstore_timer_kick(); + } total += record.size; part++; @@ -503,14 +499,20 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) } static struct console pstore_console = { - .name = "pstore", .write = pstore_console_write, - .flags = CON_PRINTBUFFER | CON_ENABLED | CON_ANYTIME, .index = -1, }; static void pstore_register_console(void) { + /* Show which backend is going to get console writes. */ + strscpy(pstore_console.name, psinfo->name, + sizeof(pstore_console.name)); + /* + * Always initialize flags here since prior unregister_console() + * calls may have changed settings (specifically CON_ENABLED). 
+ */ + pstore_console.flags = CON_PRINTBUFFER | CON_ENABLED | CON_ANYTIME; register_console(&pstore_console); } @@ -555,8 +557,6 @@ out: */ int pstore_register(struct pstore_info *psi) { - struct module *owner = psi->owner; - if (backend && strcmp(backend, psi->name)) { pr_warn("ignoring unexpected backend '%s'\n", psi->name); return -EPERM; @@ -576,11 +576,11 @@ int pstore_register(struct pstore_info *psi) return -EINVAL; } - spin_lock(&pstore_lock); + mutex_lock(&psinfo_lock); if (psinfo) { pr_warn("backend '%s' already loaded: ignoring '%s'\n", psinfo->name, psi->name); - spin_unlock(&pstore_lock); + mutex_unlock(&psinfo_lock); return -EBUSY; } @@ -589,21 +589,16 @@ int pstore_register(struct pstore_info *psi) psinfo = psi; mutex_init(&psinfo->read_mutex); sema_init(&psinfo->buf_lock, 1); - spin_unlock(&pstore_lock); - - if (owner && !try_module_get(owner)) { - psinfo = NULL; - return -EINVAL; - } if (psi->flags & PSTORE_FLAGS_DMESG) allocate_buf_for_compression(); - if (pstore_is_mounted()) - pstore_get_records(0); + pstore_get_records(0); - if (psi->flags & PSTORE_FLAGS_DMESG) + if (psi->flags & PSTORE_FLAGS_DMESG) { + pstore_dumper.max_reason = psinfo->max_reason; pstore_register_kmsg(); + } if (psi->flags & PSTORE_FLAGS_CONSOLE) pstore_register_console(); if (psi->flags & PSTORE_FLAGS_FTRACE) @@ -612,33 +607,36 @@ int pstore_register(struct pstore_info *psi) pstore_register_pmsg(); /* Start watching for new records, if desired. */ - if (pstore_update_ms >= 0) { - pstore_timer.expires = jiffies + - msecs_to_jiffies(pstore_update_ms); - add_timer(&pstore_timer); - } + pstore_timer_kick(); /* * Update the module parameter backend, so it is visible * through /sys/module/pstore/parameters/backend */ - backend = psi->name; + backend = kstrdup(psi->name, GFP_KERNEL); pr_info("Registered %s as persistent store backend\n", psi->name); - module_put(owner); - + mutex_unlock(&psinfo_lock); return 0; } EXPORT_SYMBOL_GPL(pstore_register); void pstore_unregister(struct pstore_info *psi) { - /* Stop timer and make sure all work has finished. */ - pstore_update_ms = -1; - del_timer_sync(&pstore_timer); - flush_work(&pstore_work); + /* It's okay to unregister nothing. */ + if (!psi) + return; + + mutex_lock(&psinfo_lock); + + /* Only one backend can be registered at a time. */ + if (WARN_ON(psi != psinfo)) { + mutex_unlock(&psinfo_lock); + return; + } + /* Unregister all callbacks. */ if (psi->flags & PSTORE_FLAGS_PMSG) pstore_unregister_pmsg(); if (psi->flags & PSTORE_FLAGS_FTRACE) @@ -648,10 +646,19 @@ void pstore_unregister(struct pstore_info *psi) if (psi->flags & PSTORE_FLAGS_DMESG) pstore_unregister_kmsg(); + /* Stop timer and make sure all work has finished. */ + del_timer_sync(&pstore_timer); + flush_work(&pstore_work); + + /* Remove all backend records from filesystem tree. 
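+ * Otherwise orphaned entries would outlive the backend that
+ * created them.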
*/ + pstore_put_backend_records(psi); + free_buf_for_compression(); psinfo = NULL; + kfree(backend); backend = NULL; + mutex_unlock(&psinfo_lock); } EXPORT_SYMBOL_GPL(pstore_unregister); @@ -788,9 +795,7 @@ static void pstore_timefunc(struct timer_list *unused) schedule_work(&pstore_work); } - if (pstore_update_ms >= 0) - mod_timer(&pstore_timer, - jiffies + msecs_to_jiffies(pstore_update_ms)); + pstore_timer_kick(); } static void __init pstore_choose_compression(void) @@ -835,11 +840,5 @@ static void __exit pstore_exit(void) } module_exit(pstore_exit) -module_param(compress, charp, 0444); -MODULE_PARM_DESC(compress, "Pstore compression to use"); - -module_param(backend, charp, 0444); -MODULE_PARM_DESC(backend, "Pstore backend to use"); - MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>"); MODULE_LICENSE("GPL"); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 795622190c01..ca6d8a867285 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -21,6 +21,7 @@ #include <linux/pstore_ram.h> #include <linux/of.h> #include <linux/of_address.h> +#include "internal.h" #define RAMOOPS_KERNMSG_HDR "====" #define MIN_MEM_SIZE 4096UL @@ -53,22 +54,27 @@ MODULE_PARM_DESC(mem_size, "size of reserved RAM used to store oops/panic logs"); static unsigned int mem_type; -module_param(mem_type, uint, 0600); +module_param(mem_type, uint, 0400); MODULE_PARM_DESC(mem_type, "set to 1 to try to use unbuffered memory (default 0)"); -static int dump_oops = 1; -module_param(dump_oops, int, 0600); -MODULE_PARM_DESC(dump_oops, - "set to 1 to dump oopses, 0 to only dump panics (default 1)"); +static int ramoops_max_reason = -1; +module_param_named(max_reason, ramoops_max_reason, int, 0400); +MODULE_PARM_DESC(max_reason, + "maximum reason for kmsg dump (default 2: Oops and Panic) "); static int ramoops_ecc; -module_param_named(ecc, ramoops_ecc, int, 0600); +module_param_named(ecc, ramoops_ecc, int, 0400); MODULE_PARM_DESC(ramoops_ecc, "if non-zero, the option enables ECC support and specifies " "ECC buffer size in bytes (1 is a special value, means 16 " "bytes ECC)"); +static int ramoops_dump_oops = -1; +module_param_named(dump_oops, ramoops_dump_oops, int, 0400); +MODULE_PARM_DESC(dump_oops, + "(deprecated: use max_reason instead) set to 1 to dump oopses & panics, 0 to only dump panics"); + struct ramoops_context { struct persistent_ram_zone **dprzs; /* Oops dump zones */ struct persistent_ram_zone *cprz; /* Console zone */ @@ -81,7 +87,6 @@ struct ramoops_context { size_t console_size; size_t ftrace_size; size_t pmsg_size; - int dump_oops; u32 flags; struct persistent_ram_ecc_info ecc_info; unsigned int max_dump_cnt; @@ -168,58 +173,6 @@ static bool prz_ok(struct persistent_ram_zone *prz) persistent_ram_ecc_string(prz, NULL, 0)); } -static ssize_t ftrace_log_combine(struct persistent_ram_zone *dest, - struct persistent_ram_zone *src) -{ - size_t dest_size, src_size, total, dest_off, src_off; - size_t dest_idx = 0, src_idx = 0, merged_idx = 0; - void *merged_buf; - struct pstore_ftrace_record *drec, *srec, *mrec; - size_t record_size = sizeof(struct pstore_ftrace_record); - - dest_off = dest->old_log_size % record_size; - dest_size = dest->old_log_size - dest_off; - - src_off = src->old_log_size % record_size; - src_size = src->old_log_size - src_off; - - total = dest_size + src_size; - merged_buf = kmalloc(total, GFP_KERNEL); - if (!merged_buf) - return -ENOMEM; - - drec = (struct pstore_ftrace_record *)(dest->old_log + dest_off); - srec = (struct pstore_ftrace_record *)(src->old_log + src_off); - mrec = (struct 
pstore_ftrace_record *)(merged_buf); - - while (dest_size > 0 && src_size > 0) { - if (pstore_ftrace_read_timestamp(&drec[dest_idx]) < - pstore_ftrace_read_timestamp(&srec[src_idx])) { - mrec[merged_idx++] = drec[dest_idx++]; - dest_size -= record_size; - } else { - mrec[merged_idx++] = srec[src_idx++]; - src_size -= record_size; - } - } - - while (dest_size > 0) { - mrec[merged_idx++] = drec[dest_idx++]; - dest_size -= record_size; - } - - while (src_size > 0) { - mrec[merged_idx++] = srec[src_idx++]; - src_size -= record_size; - } - - kfree(dest->old_log); - dest->old_log = merged_buf; - dest->old_log_size = total; - - return 0; -} - static ssize_t ramoops_pstore_read(struct pstore_record *record) { ssize_t size = 0; @@ -291,7 +244,12 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record) tmp_prz->corrected_bytes += prz_next->corrected_bytes; tmp_prz->bad_blocks += prz_next->bad_blocks; - size = ftrace_log_combine(tmp_prz, prz_next); + + size = pstore_ftrace_combine_log( + &tmp_prz->old_log, + &tmp_prz->old_log_size, + prz_next->old_log, + prz_next->old_log_size); if (size) goto out; } @@ -382,16 +340,14 @@ static int notrace ramoops_pstore_write(struct pstore_record *record) return -EINVAL; /* - * Out of the various dmesg dump types, ramoops is currently designed - * to only store crash logs, rather than storing general kernel logs. + * We could filter on record->reason here if we wanted to (which + * would duplicate what happened before the "max_reason" setting + * was added), but that would defeat the purpose of a system + * changing printk.always_kmsg_dump, so instead log everything that + * the kmsg dumper sends us, since it should be doing the filtering + * based on the combination of printk.always_kmsg_dump and our + * requested "max_reason". */ - if (record->reason != KMSG_DUMP_OOPS && - record->reason != KMSG_DUMP_PANIC) - return -EINVAL; - - /* Skip Oopes when configured to do so. */ - if (record->reason == KMSG_DUMP_OOPS && !cxt->dump_oops) - return -EINVAL; /* * Explicitly only take the first part of any new crash. @@ -644,19 +600,25 @@ static int ramoops_init_prz(const char *name, return 0; } -static int ramoops_parse_dt_size(struct platform_device *pdev, - const char *propname, u32 *value) +/* Read a u32 from a dt property and make sure it's safe for an int. */ +static int ramoops_parse_dt_u32(struct platform_device *pdev, + const char *propname, + u32 default_value, u32 *value) { u32 val32 = 0; int ret; ret = of_property_read_u32(pdev->dev.of_node, propname, &val32); - if (ret < 0 && ret != -EINVAL) { + if (ret == -EINVAL) { + /* field is missing, use default value. */ + val32 = default_value; + } else if (ret < 0) { dev_err(&pdev->dev, "failed to parse property %s: %d\n", propname, ret); return ret; } + /* Sanity check our results. */ if (val32 > INT_MAX) { dev_err(&pdev->dev, "%s %u > INT_MAX\n", propname, val32); return -EOVERFLOW; @@ -687,23 +649,32 @@ static int ramoops_parse_dt(struct platform_device *pdev, pdata->mem_size = resource_size(res); pdata->mem_address = res->start; pdata->mem_type = of_property_read_bool(of_node, "unbuffered"); - pdata->dump_oops = !of_property_read_bool(of_node, "no-dump-oops"); - -#define parse_size(name, field) { \ - ret = ramoops_parse_dt_size(pdev, name, &value); \ + /* + * Setting "no-dump-oops" is deprecated and will be ignored if + * "max_reason" is also specified. 
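+ * The value chosen here only acts as the default for the
+ * "max-reason" property parsed below, which takes precedence when
+ * present.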
+ */ + if (of_property_read_bool(of_node, "no-dump-oops")) + pdata->max_reason = KMSG_DUMP_PANIC; + else + pdata->max_reason = KMSG_DUMP_OOPS; + +#define parse_u32(name, field, default_value) { \ + ret = ramoops_parse_dt_u32(pdev, name, default_value, \ + &value); \ if (ret < 0) \ return ret; \ field = value; \ } - parse_size("record-size", pdata->record_size); - parse_size("console-size", pdata->console_size); - parse_size("ftrace-size", pdata->ftrace_size); - parse_size("pmsg-size", pdata->pmsg_size); - parse_size("ecc-size", pdata->ecc_info.ecc_size); - parse_size("flags", pdata->flags); + parse_u32("record-size", pdata->record_size, 0); + parse_u32("console-size", pdata->console_size, 0); + parse_u32("ftrace-size", pdata->ftrace_size, 0); + parse_u32("pmsg-size", pdata->pmsg_size, 0); + parse_u32("ecc-size", pdata->ecc_info.ecc_size, 0); + parse_u32("flags", pdata->flags, 0); + parse_u32("max-reason", pdata->max_reason, pdata->max_reason); -#undef parse_size +#undef parse_u32 /* * Some old Chromebooks relied on the kernel setting the @@ -785,7 +756,6 @@ static int ramoops_probe(struct platform_device *pdev) cxt->console_size = pdata->console_size; cxt->ftrace_size = pdata->ftrace_size; cxt->pmsg_size = pdata->pmsg_size; - cxt->dump_oops = pdata->dump_oops; cxt->flags = pdata->flags; cxt->ecc_info = pdata->ecc_info; @@ -828,8 +798,10 @@ static int ramoops_probe(struct platform_device *pdev) * the single region size is how to check. */ cxt->pstore.flags = 0; - if (cxt->max_dump_cnt) + if (cxt->max_dump_cnt) { cxt->pstore.flags |= PSTORE_FLAGS_DMESG; + cxt->pstore.max_reason = pdata->max_reason; + } if (cxt->console_size) cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE; if (cxt->max_ftrace_cnt) @@ -865,7 +837,7 @@ static int ramoops_probe(struct platform_device *pdev) mem_size = pdata->mem_size; mem_address = pdata->mem_address; record_size = pdata->record_size; - dump_oops = pdata->dump_oops; + ramoops_max_reason = pdata->max_reason; ramoops_console_size = pdata->console_size; ramoops_pmsg_size = pdata->pmsg_size; ramoops_ftrace_size = pdata->ftrace_size; @@ -948,7 +920,16 @@ static void __init ramoops_register_dummy(void) pdata.console_size = ramoops_console_size; pdata.ftrace_size = ramoops_ftrace_size; pdata.pmsg_size = ramoops_pmsg_size; - pdata.dump_oops = dump_oops; + /* If "max_reason" is set, its value has priority over "dump_oops". */ + if (ramoops_max_reason >= 0) + pdata.max_reason = ramoops_max_reason; + /* Otherwise, if "dump_oops" is set, parse it into "max_reason". */ + else if (ramoops_dump_oops != -1) + pdata.max_reason = ramoops_dump_oops ? KMSG_DUMP_OOPS + : KMSG_DUMP_PANIC; + /* And if neither are explicitly set, use the default. */ + else + pdata.max_reason = KMSG_DUMP_OOPS; pdata.flags = RAMOOPS_FLAG_FTRACE_PER_CPU; /* diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c new file mode 100644 index 000000000000..819428dfa32f --- /dev/null +++ b/fs/pstore/zone.c @@ -0,0 +1,1465 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Provide a pstore intermediate backend, organized into kernel memory + * allocated zones that are then mapped and flushed into a single + * contiguous region on a storage backend of some kind (block, mtd, etc). 
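+ *
+ * A backend describes its storage region and I/O hooks in a
+ * &struct pstore_zone_info and hands it to register_pstore_zone().
+ * A minimal, purely illustrative registration -- myblk_read() and
+ * myblk_write() are hypothetical helpers a backend would supply --
+ * might look like:
+ *
+ * static struct pstore_zone_info myblk_info = {
+ * .name = "myblk",
+ * .total_size = 1024 * 1024,
+ * .kmsg_size = 64 * 1024,
+ * .max_reason = KMSG_DUMP_OOPS,
+ * .read = myblk_read,
+ * .write = myblk_write,
+ * };
+ *
+ * err = register_pstore_zone(&myblk_info);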
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mount.h>
+#include <linux/printk.h>
+#include <linux/fs.h>
+#include <linux/pstore_zone.h>
+#include <linux/kdev_t.h>
+#include <linux/device.h>
+#include <linux/namei.h>
+#include <linux/fcntl.h>
+#include <linux/uio.h>
+#include <linux/writeback.h>
+#include "internal.h"
+
+/**
+ * struct psz_buffer - header of zone to flush to storage
+ *
+ * @sig: signature to indicate header (PSZ_SIG xor PSZONE-type value)
+ * @datalen: length of data in @data
+ * @start: offset into @data where the stored bytes begin
+ * @data: zone data.
+ */
+struct psz_buffer {
+#define PSZ_SIG (0x43474244) /* DBGC */
+ uint32_t sig;
+ atomic_t datalen;
+ atomic_t start;
+ uint8_t data[];
+};
+
+/**
+ * struct psz_kmsg_header - kmsg dump-specific header to flush to storage
+ *
+ * @magic: magic number for the kmsg dump header
+ * @time: kmsg dump trigger time
+ * @compressed: whether the data is compressed
+ * @counter: kmsg dump counter
+ * @reason: the kmsg dump reason (e.g. oops, panic, etc)
+ * @data: pointer to log data
+ *
+ * This is a sub-header for a kmsg dump, trailing after &psz_buffer.
+ */
+struct psz_kmsg_header {
+#define PSTORE_KMSG_HEADER_MAGIC 0x4dfc3ae5 /* Just a random number */
+ uint32_t magic;
+ struct timespec64 time;
+ bool compressed;
+ uint32_t counter;
+ enum kmsg_dump_reason reason;
+ uint8_t data[];
+};
+
+/**
+ * struct pstore_zone - single stored buffer
+ *
+ * @off: zone offset of storage
+ * @type: front-end type for this zone
+ * @name: front-end name for this zone
+ * @buffer: pointer to data buffer managed by this zone
+ * @oldbuf: pointer to old data buffer
+ * @buffer_size: bytes in @buffer->data
+ * @should_recover: whether this zone should recover from storage
+ * @dirty: whether the data in @buffer is dirty
+ *
+ * zone structure in memory.
+ */
+struct pstore_zone {
+ loff_t off;
+ const char *name;
+ enum pstore_type_id type;
+
+ struct psz_buffer *buffer;
+ struct psz_buffer *oldbuf;
+ size_t buffer_size;
+ bool should_recover;
+ atomic_t dirty;
+};
+
+/**
+ * struct psz_context - running state of pstore/zone
+ *
+ * @kpszs: kmsg dump storage zones
+ * @ppsz: pmsg storage zone
+ * @cpsz: console storage zone
+ * @fpszs: ftrace storage zones
+ * @kmsg_max_cnt: max count of @kpszs
+ * @kmsg_read_cnt: counter of total read kmsg dumps
+ * @kmsg_write_cnt: counter of total kmsg dump writes
+ * @pmsg_read_cnt: counter of total reads of the pmsg zone
+ * @console_read_cnt: counter of total reads of the console zone
+ * @ftrace_max_cnt: max count of @fpszs
+ * @ftrace_read_cnt: counter of read ftrace zones
+ * @oops_counter: counter of oops dumps
+ * @panic_counter: counter of panic dumps
+ * @recovered: whether finished recovering data from storage
+ * @on_panic: whether a panic is in progress
+ * @pstore_zone_info_lock: lock for @pstore_zone_info
+ * @pstore_zone_info: information from the backend
+ * @pstore: structure for pstore
+ */
+struct psz_context {
+ struct pstore_zone **kpszs;
+ struct pstore_zone *ppsz;
+ struct pstore_zone *cpsz;
+ struct pstore_zone **fpszs;
+ unsigned int kmsg_max_cnt;
+ unsigned int kmsg_read_cnt;
+ unsigned int kmsg_write_cnt;
+ unsigned int pmsg_read_cnt;
+ unsigned int console_read_cnt;
+ unsigned int ftrace_max_cnt;
+ unsigned int ftrace_read_cnt;
+ /*
+ * These counters are recalculated during recovery. They record the
+ * number of oops/panic dumps over all crashes, not just since the
+ * last boot.
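+ * Both values are persisted in each kmsg zone's header, so
+ * psz_kmsg_recover_meta() can restore them after a crash.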
+ */
+ unsigned int oops_counter;
+ unsigned int panic_counter;
+ atomic_t recovered;
+ atomic_t on_panic;
+
+ /*
+ * pstore_zone_info_lock protects this entire structure during calls
+ * to register_pstore_zone()/unregister_pstore_zone().
+ */
+ struct mutex pstore_zone_info_lock;
+ struct pstore_zone_info *pstore_zone_info;
+ struct pstore_info pstore;
+};
+static struct psz_context pstore_zone_cxt;
+
+static void psz_flush_all_dirty_zones(struct work_struct *);
+static DECLARE_DELAYED_WORK(psz_cleaner, psz_flush_all_dirty_zones);
+
+/**
+ * enum psz_flush_mode - flush mode for psz_zone_write()
+ *
+ * @FLUSH_NONE: do not flush to storage; only update the data in memory
+ * @FLUSH_PART: flush part of the data, including the metadata, to storage
+ * @FLUSH_META: flush only the zone's metadata to storage
+ * @FLUSH_ALL: flush the entire zone
+ */
+enum psz_flush_mode {
+ FLUSH_NONE = 0,
+ FLUSH_PART,
+ FLUSH_META,
+ FLUSH_ALL,
+};
+
+static inline int buffer_datalen(struct pstore_zone *zone)
+{
+ return atomic_read(&zone->buffer->datalen);
+}
+
+static inline int buffer_start(struct pstore_zone *zone)
+{
+ return atomic_read(&zone->buffer->start);
+}
+
+static inline bool is_on_panic(void)
+{
+ return atomic_read(&pstore_zone_cxt.on_panic);
+}
+
+static ssize_t psz_zone_read_buffer(struct pstore_zone *zone, char *buf,
+ size_t len, unsigned long off)
+{
+ if (!buf || !zone || !zone->buffer)
+ return -EINVAL;
+ if (off > zone->buffer_size)
+ return -EINVAL;
+ len = min_t(size_t, len, zone->buffer_size - off);
+ memcpy(buf, zone->buffer->data + off, len);
+ return len;
+}
+
+static int psz_zone_read_oldbuf(struct pstore_zone *zone, char *buf,
+ size_t len, unsigned long off)
+{
+ if (!buf || !zone || !zone->oldbuf)
+ return -EINVAL;
+ if (off > zone->buffer_size)
+ return -EINVAL;
+ len = min_t(size_t, len, zone->buffer_size - off);
+ memcpy(buf, zone->oldbuf->data + off, len);
+ return 0;
+}
+
+static int psz_zone_write(struct pstore_zone *zone,
+ enum psz_flush_mode flush_mode, const char *buf,
+ size_t len, unsigned long off)
+{
+ struct pstore_zone_info *info = pstore_zone_cxt.pstore_zone_info;
+ ssize_t wcnt = 0;
+ ssize_t (*writeop)(const char *buf, size_t bytes, loff_t pos);
+ size_t wlen;
+
+ if (off > zone->buffer_size)
+ return -EINVAL;
+
+ wlen = min_t(size_t, len, zone->buffer_size - off);
+ if (buf && wlen) {
+ memcpy(zone->buffer->data + off, buf, wlen);
+ atomic_set(&zone->buffer->datalen, wlen + off);
+ }
+
+ /* avoid damaging old records */
+ if (!is_on_panic() && !atomic_read(&pstore_zone_cxt.recovered))
+ goto dirty;
+
+ writeop = is_on_panic() ?
info->panic_write : info->write; + if (!writeop) + goto dirty; + + switch (flush_mode) { + case FLUSH_NONE: + if (unlikely(buf && wlen)) + goto dirty; + return 0; + case FLUSH_PART: + wcnt = writeop((const char *)zone->buffer->data + off, wlen, + zone->off + sizeof(*zone->buffer) + off); + if (wcnt != wlen) + goto dirty; + fallthrough; + case FLUSH_META: + wlen = sizeof(struct psz_buffer); + wcnt = writeop((const char *)zone->buffer, wlen, zone->off); + if (wcnt != wlen) + goto dirty; + break; + case FLUSH_ALL: + wlen = zone->buffer_size + sizeof(*zone->buffer); + wcnt = writeop((const char *)zone->buffer, wlen, zone->off); + if (wcnt != wlen) + goto dirty; + break; + } + + return 0; +dirty: + /* no need to mark dirty if going to try next zone */ + if (wcnt == -ENOMSG) + return -ENOMSG; + atomic_set(&zone->dirty, true); + /* flush dirty zones nicely */ + if (wcnt == -EBUSY && !is_on_panic()) + schedule_delayed_work(&psz_cleaner, msecs_to_jiffies(500)); + return -EBUSY; +} + +static int psz_flush_dirty_zone(struct pstore_zone *zone) +{ + int ret; + + if (unlikely(!zone)) + return -EINVAL; + + if (unlikely(!atomic_read(&pstore_zone_cxt.recovered))) + return -EBUSY; + + if (!atomic_xchg(&zone->dirty, false)) + return 0; + + ret = psz_zone_write(zone, FLUSH_ALL, NULL, 0, 0); + if (ret) + atomic_set(&zone->dirty, true); + return ret; +} + +static int psz_flush_dirty_zones(struct pstore_zone **zones, unsigned int cnt) +{ + int i, ret; + struct pstore_zone *zone; + + if (!zones) + return -EINVAL; + + for (i = 0; i < cnt; i++) { + zone = zones[i]; + if (!zone) + return -EINVAL; + ret = psz_flush_dirty_zone(zone); + if (ret) + return ret; + } + return 0; +} + +static int psz_move_zone(struct pstore_zone *old, struct pstore_zone *new) +{ + const char *data = (const char *)old->buffer->data; + int ret; + + ret = psz_zone_write(new, FLUSH_ALL, data, buffer_datalen(old), 0); + if (ret) { + atomic_set(&new->buffer->datalen, 0); + atomic_set(&new->dirty, false); + return ret; + } + atomic_set(&old->buffer->datalen, 0); + return 0; +} + +static void psz_flush_all_dirty_zones(struct work_struct *work) +{ + struct psz_context *cxt = &pstore_zone_cxt; + int ret = 0; + + if (cxt->ppsz) + ret |= psz_flush_dirty_zone(cxt->ppsz); + if (cxt->cpsz) + ret |= psz_flush_dirty_zone(cxt->cpsz); + if (cxt->kpszs) + ret |= psz_flush_dirty_zones(cxt->kpszs, cxt->kmsg_max_cnt); + if (cxt->fpszs) + ret |= psz_flush_dirty_zones(cxt->fpszs, cxt->ftrace_max_cnt); + if (ret && cxt->pstore_zone_info) + schedule_delayed_work(&psz_cleaner, msecs_to_jiffies(1000)); +} + +static int psz_kmsg_recover_data(struct psz_context *cxt) +{ + struct pstore_zone_info *info = cxt->pstore_zone_info; + struct pstore_zone *zone = NULL; + struct psz_buffer *buf; + unsigned long i; + ssize_t rcnt; + + if (!info->read) + return -EINVAL; + + for (i = 0; i < cxt->kmsg_max_cnt; i++) { + zone = cxt->kpszs[i]; + if (unlikely(!zone)) + return -EINVAL; + if (atomic_read(&zone->dirty)) { + unsigned int wcnt = cxt->kmsg_write_cnt; + struct pstore_zone *new = cxt->kpszs[wcnt]; + int ret; + + ret = psz_move_zone(zone, new); + if (ret) { + pr_err("move zone from %lu to %d failed\n", + i, wcnt); + return ret; + } + cxt->kmsg_write_cnt = (wcnt + 1) % cxt->kmsg_max_cnt; + } + if (!zone->should_recover) + continue; + buf = zone->buffer; + rcnt = info->read((char *)buf, zone->buffer_size + sizeof(*buf), + zone->off); + if (rcnt != zone->buffer_size + sizeof(*buf)) + return (int)rcnt < 0 ? 
(int)rcnt : -EIO;
+ }
+ return 0;
+}
+
+static int psz_kmsg_recover_meta(struct psz_context *cxt)
+{
+ struct pstore_zone_info *info = cxt->pstore_zone_info;
+ struct pstore_zone *zone;
+ size_t rcnt, len;
+ struct psz_buffer *buf;
+ struct psz_kmsg_header *hdr;
+ struct timespec64 time = { };
+ unsigned long i;
+ /*
+ * Recovery may run during a panic, when we cannot allocate any
+ * memory with kmalloc(), so use a local array instead.
+ */
+ char buffer_header[sizeof(*buf) + sizeof(*hdr)] = {0};
+
+ if (!info->read)
+ return -EINVAL;
+
+ len = sizeof(*buf) + sizeof(*hdr);
+ buf = (struct psz_buffer *)buffer_header;
+ for (i = 0; i < cxt->kmsg_max_cnt; i++) {
+ zone = cxt->kpszs[i];
+ if (unlikely(!zone))
+ return -EINVAL;
+
+ rcnt = info->read((char *)buf, len, zone->off);
+ if (rcnt == -ENOMSG) {
+ pr_debug("%s with id %lu may be broken, skip\n",
+ zone->name, i);
+ continue;
+ } else if (rcnt != len) {
+ pr_err("read %s with id %lu failed\n", zone->name, i);
+ return (int)rcnt < 0 ? (int)rcnt : -EIO;
+ }
+
+ if (buf->sig != zone->buffer->sig) {
+ pr_debug("no valid data in kmsg dump zone %lu\n", i);
+ continue;
+ }
+
+ if (zone->buffer_size < atomic_read(&buf->datalen)) {
+ pr_info("found overtop zone: %s: id %lu, off %lld, size %zu\n",
+ zone->name, i, zone->off,
+ zone->buffer_size);
+ continue;
+ }
+
+ hdr = (struct psz_kmsg_header *)buf->data;
+ if (hdr->magic != PSTORE_KMSG_HEADER_MAGIC) {
+ pr_info("found invalid zone: %s: id %lu, off %lld, size %zu\n",
+ zone->name, i, zone->off,
+ zone->buffer_size);
+ continue;
+ }
+
+ /*
+ * Track the newest zone: the next one must be the oldest or
+ * an unused zone, because zones are written one by one in
+ * circular order.
+ */
+ if (hdr->time.tv_sec >= time.tv_sec) {
+ time.tv_sec = hdr->time.tv_sec;
+ cxt->kmsg_write_cnt = (i + 1) % cxt->kmsg_max_cnt;
+ }
+
+ if (hdr->reason == KMSG_DUMP_OOPS)
+ cxt->oops_counter =
+ max(cxt->oops_counter, hdr->counter);
+ else if (hdr->reason == KMSG_DUMP_PANIC)
+ cxt->panic_counter =
+ max(cxt->panic_counter, hdr->counter);
+
+ if (!atomic_read(&buf->datalen)) {
+ pr_debug("found erased zone: %s: id %lu, off %lld, size %zu, datalen %d\n",
+ zone->name, i, zone->off,
+ zone->buffer_size,
+ atomic_read(&buf->datalen));
+ continue;
+ }
+
+ if (!is_on_panic())
+ zone->should_recover = true;
+ pr_debug("found nice zone: %s: id %lu, off %lld, size %zu, datalen %d\n",
+ zone->name, i, zone->off,
+ zone->buffer_size, atomic_read(&buf->datalen));
+ }
+
+ return 0;
+}
+
+static int psz_kmsg_recover(struct psz_context *cxt)
+{
+ int ret;
+
+ if (!cxt->kpszs)
+ return 0;
+
+ ret = psz_kmsg_recover_meta(cxt);
+ if (ret)
+ goto recover_fail;
+
+ ret = psz_kmsg_recover_data(cxt);
+ if (ret)
+ goto recover_fail;
+
+ return 0;
+recover_fail:
+ pr_debug("psz_recover_kmsg failed\n");
+ return ret;
+}
+
+static int psz_recover_zone(struct psz_context *cxt, struct pstore_zone *zone)
+{
+ struct pstore_zone_info *info = cxt->pstore_zone_info;
+ struct psz_buffer *oldbuf, tmpbuf;
+ int ret = 0;
+ char *buf;
+ ssize_t rcnt, len, start, off;
+
+ if (!zone || zone->oldbuf)
+ return 0;
+
+ if (is_on_panic()) {
+ /* save as much data as possible */
+ psz_flush_dirty_zone(zone);
+ return 0;
+ }
+
+ if (unlikely(!info->read))
+ return -EINVAL;
+
+ len = sizeof(struct psz_buffer);
+ rcnt = info->read((char *)&tmpbuf, len, zone->off);
+ if (rcnt != len) {
+ pr_debug("read zone %s failed\n", zone->name);
+ return (int)rcnt < 0 ?
(int)rcnt : -EIO; + } + + if (tmpbuf.sig != zone->buffer->sig) { + pr_debug("no valid data in zone %s\n", zone->name); + return 0; + } + + if (zone->buffer_size < atomic_read(&tmpbuf.datalen) || + zone->buffer_size < atomic_read(&tmpbuf.start)) { + pr_info("found overtop zone: %s: off %lld, size %zu\n", + zone->name, zone->off, zone->buffer_size); + /* just keep going */ + return 0; + } + + if (!atomic_read(&tmpbuf.datalen)) { + pr_debug("found erased zone: %s: off %lld, size %zu, datalen %d\n", + zone->name, zone->off, zone->buffer_size, + atomic_read(&tmpbuf.datalen)); + return 0; + } + + pr_debug("found nice zone: %s: off %lld, size %zu, datalen %d\n", + zone->name, zone->off, zone->buffer_size, + atomic_read(&tmpbuf.datalen)); + + len = atomic_read(&tmpbuf.datalen) + sizeof(*oldbuf); + oldbuf = kzalloc(len, GFP_KERNEL); + if (!oldbuf) + return -ENOMEM; + + memcpy(oldbuf, &tmpbuf, sizeof(*oldbuf)); + buf = (char *)oldbuf + sizeof(*oldbuf); + len = atomic_read(&oldbuf->datalen); + start = atomic_read(&oldbuf->start); + off = zone->off + sizeof(*oldbuf); + + /* get part of data */ + rcnt = info->read(buf, len - start, off + start); + if (rcnt != len - start) { + pr_err("read zone %s failed\n", zone->name); + ret = (int)rcnt < 0 ? (int)rcnt : -EIO; + goto free_oldbuf; + } + + /* get the rest of data */ + rcnt = info->read(buf + len - start, start, off); + if (rcnt != start) { + pr_err("read zone %s failed\n", zone->name); + ret = (int)rcnt < 0 ? (int)rcnt : -EIO; + goto free_oldbuf; + } + + zone->oldbuf = oldbuf; + psz_flush_dirty_zone(zone); + return 0; + +free_oldbuf: + kfree(oldbuf); + return ret; +} + +static int psz_recover_zones(struct psz_context *cxt, + struct pstore_zone **zones, unsigned int cnt) +{ + int ret; + unsigned int i; + struct pstore_zone *zone; + + if (!zones) + return 0; + + for (i = 0; i < cnt; i++) { + zone = zones[i]; + if (unlikely(!zone)) + continue; + ret = psz_recover_zone(cxt, zone); + if (ret) + goto recover_fail; + } + + return 0; +recover_fail: + pr_debug("recover %s[%u] failed\n", zone->name, i); + return ret; +} + +/** + * psz_recovery() - recover data from storage + * @cxt: the context of pstore/zone + * + * recovery means reading data back from storage after rebooting + * + * Return: 0 on success, others on failure. 
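+ *
+ * Recovery runs at most once: after it succeeds, @recovered is set
+ * and subsequent calls return 0 immediately.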
+ */
+static inline int psz_recovery(struct psz_context *cxt)
+{
+ int ret;
+
+ if (atomic_read(&cxt->recovered))
+ return 0;
+
+ ret = psz_kmsg_recover(cxt);
+ if (ret)
+ goto out;
+
+ ret = psz_recover_zone(cxt, cxt->ppsz);
+ if (ret)
+ goto out;
+
+ ret = psz_recover_zone(cxt, cxt->cpsz);
+ if (ret)
+ goto out;
+
+ ret = psz_recover_zones(cxt, cxt->fpszs, cxt->ftrace_max_cnt);
+
+out:
+ if (unlikely(ret))
+ pr_err("recover failed\n");
+ else {
+ pr_debug("recover end!\n");
+ atomic_set(&cxt->recovered, 1);
+ }
+ return ret;
+}
+
+static int psz_pstore_open(struct pstore_info *psi)
+{
+ struct psz_context *cxt = psi->data;
+
+ cxt->kmsg_read_cnt = 0;
+ cxt->pmsg_read_cnt = 0;
+ cxt->console_read_cnt = 0;
+ cxt->ftrace_read_cnt = 0;
+ return 0;
+}
+
+static inline bool psz_old_ok(struct pstore_zone *zone)
+{
+ if (zone && zone->oldbuf && atomic_read(&zone->oldbuf->datalen))
+ return true;
+ return false;
+}
+
+static inline bool psz_ok(struct pstore_zone *zone)
+{
+ if (zone && zone->buffer && buffer_datalen(zone))
+ return true;
+ return false;
+}
+
+static inline int psz_kmsg_erase(struct psz_context *cxt,
+ struct pstore_zone *zone, struct pstore_record *record)
+{
+ struct psz_buffer *buffer = zone->buffer;
+ struct psz_kmsg_header *hdr =
+ (struct psz_kmsg_header *)buffer->data;
+ size_t size;
+
+ if (unlikely(!psz_ok(zone)))
+ return 0;
+
+ /* this zone has already been updated; no need to erase */
+ if (record->count != hdr->counter)
+ return 0;
+
+ size = buffer_datalen(zone) + sizeof(*zone->buffer);
+ atomic_set(&zone->buffer->datalen, 0);
+ if (cxt->pstore_zone_info->erase)
+ return cxt->pstore_zone_info->erase(size, zone->off);
+ else
+ return psz_zone_write(zone, FLUSH_META, NULL, 0, 0);
+}
+
+static inline int psz_record_erase(struct psz_context *cxt,
+ struct pstore_zone *zone)
+{
+ if (unlikely(!psz_old_ok(zone)))
+ return 0;
+
+ kfree(zone->oldbuf);
+ zone->oldbuf = NULL;
+ /*
+ * If there is new data in the zone buffer, the old data is already
+ * invalid; there is no need to flush zeroes (an erase) to the
+ * block device.
+ */
+ if (!buffer_datalen(zone))
+ return psz_zone_write(zone, FLUSH_META, NULL, 0, 0);
+ psz_flush_dirty_zone(zone);
+ return 0;
+}
+
+static int psz_pstore_erase(struct pstore_record *record)
+{
+ struct psz_context *cxt = record->psi->data;
+
+ switch (record->type) {
+ case PSTORE_TYPE_DMESG:
+ if (record->id >= cxt->kmsg_max_cnt)
+ return -EINVAL;
+ return psz_kmsg_erase(cxt, cxt->kpszs[record->id], record);
+ case PSTORE_TYPE_PMSG:
+ return psz_record_erase(cxt, cxt->ppsz);
+ case PSTORE_TYPE_CONSOLE:
+ return psz_record_erase(cxt, cxt->cpsz);
+ case PSTORE_TYPE_FTRACE:
+ if (record->id >= cxt->ftrace_max_cnt)
+ return -EINVAL;
+ return psz_record_erase(cxt, cxt->fpszs[record->id]);
+ default:
+ return -EINVAL;
+ }
+}
+
+static void psz_write_kmsg_hdr(struct pstore_zone *zone,
+ struct pstore_record *record)
+{
+ struct psz_context *cxt = record->psi->data;
+ struct psz_buffer *buffer = zone->buffer;
+ struct psz_kmsg_header *hdr =
+ (struct psz_kmsg_header *)buffer->data;
+
+ hdr->magic = PSTORE_KMSG_HEADER_MAGIC;
+ hdr->compressed = record->compressed;
+ hdr->time.tv_sec = record->time.tv_sec;
+ hdr->time.tv_nsec = record->time.tv_nsec;
+ hdr->reason = record->reason;
+ if (hdr->reason == KMSG_DUMP_OOPS)
+ hdr->counter = ++cxt->oops_counter;
+ else if (hdr->reason == KMSG_DUMP_PANIC)
+ hdr->counter = ++cxt->panic_counter;
+ else
+ hdr->counter = 0;
+}
+
+/*
+ * In case a zone is broken, which can happen on MTD devices, try each
+ * zone in turn, starting at cxt->kmsg_write_cnt.
+ */
+static inline int notrace psz_kmsg_write_record(struct psz_context *cxt,
+ struct pstore_record *record)
+{
+ size_t size, hlen;
+ struct pstore_zone *zone;
+ unsigned int i;
+
+ for (i = 0; i < cxt->kmsg_max_cnt; i++) {
+ unsigned int zonenum, len;
+ int ret;
+
+ zonenum = (cxt->kmsg_write_cnt + i) % cxt->kmsg_max_cnt;
+ zone = cxt->kpszs[zonenum];
+ if (unlikely(!zone))
+ return -ENOSPC;
+
+ /* to avoid destroying old data, allocate a new buffer */
+ len = zone->buffer_size + sizeof(*zone->buffer);
+ zone->oldbuf = zone->buffer;
+ zone->buffer = kzalloc(len, GFP_KERNEL);
+ if (!zone->buffer) {
+ zone->buffer = zone->oldbuf;
+ return -ENOMEM;
+ }
+ zone->buffer->sig = zone->oldbuf->sig;
+
+ pr_debug("write %s to zone id %d\n", zone->name, zonenum);
+ psz_write_kmsg_hdr(zone, record);
+ hlen = sizeof(struct psz_kmsg_header);
+ size = min_t(size_t, record->size, zone->buffer_size - hlen);
+ ret = psz_zone_write(zone, FLUSH_ALL, record->buf, size, hlen);
+ if (likely(!ret || ret != -ENOMSG)) {
+ cxt->kmsg_write_cnt = zonenum + 1;
+ cxt->kmsg_write_cnt %= cxt->kmsg_max_cnt;
+ /* no need to try the next zone; free the old buffer */
+ kfree(zone->oldbuf);
+ zone->oldbuf = NULL;
+ return ret;
+ }
+
+ pr_debug("zone %u may be broken, try next dmesg zone\n",
+ zonenum);
+ kfree(zone->buffer);
+ zone->buffer = zone->oldbuf;
+ zone->oldbuf = NULL;
+ }
+
+ return -EBUSY;
+}
+
+static int notrace psz_kmsg_write(struct psz_context *cxt,
+ struct pstore_record *record)
+{
+ int ret;
+
+ /*
+ * Explicitly only take the first part of any new crash.
+ * If our buffer is larger than kmsg_bytes, this can never happen,
+ * and if our buffer is smaller than kmsg_bytes, we don't want the
+ * report split across multiple records.
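+ * (Any part other than the first is rejected below with -ENOSPC.)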
+ */
+ if (record->part != 1)
+ return -ENOSPC;
+
+ if (!cxt->kpszs)
+ return -ENOSPC;
+
+ ret = psz_kmsg_write_record(cxt, record);
+ if (!ret && is_on_panic()) {
+ /* ensure all data is flushed to storage on panic */
+ pr_debug("try to flush other dirty zones\n");
+ psz_flush_all_dirty_zones(NULL);
+ }
+
+ /* always return 0, since the record was handled in the buffer */
+ return 0;
+}
+
+static int notrace psz_record_write(struct pstore_zone *zone,
+ struct pstore_record *record)
+{
+ size_t start, rem;
+ bool is_full_data = false;
+ char *buf;
+ int cnt;
+
+ if (!zone || !record)
+ return -ENOSPC;
+
+ if (atomic_read(&zone->buffer->datalen) >= zone->buffer_size)
+ is_full_data = true;
+
+ cnt = record->size;
+ buf = record->buf;
+ if (unlikely(cnt > zone->buffer_size)) {
+ buf += cnt - zone->buffer_size;
+ cnt = zone->buffer_size;
+ }
+
+ start = buffer_start(zone);
+ rem = zone->buffer_size - start;
+ if (unlikely(rem < cnt)) {
+ psz_zone_write(zone, FLUSH_PART, buf, rem, start);
+ buf += rem;
+ cnt -= rem;
+ start = 0;
+ is_full_data = true;
+ }
+
+ atomic_set(&zone->buffer->start, cnt + start);
+ psz_zone_write(zone, FLUSH_PART, buf, cnt, start);
+
+ /*
+ * psz_zone_write() sets datalen to start + cnt. That is correct as
+ * long as the actual data length is less than the buffer size. Once
+ * the data length exceeds the buffer size, writes wrap around to the
+ * beginning of the zone, which would leave buffer->datalen wrong, so
+ * reset datalen to the buffer size whenever the zone is full.
+ */
+ if (is_full_data) {
+ atomic_set(&zone->buffer->datalen, zone->buffer_size);
+ psz_zone_write(zone, FLUSH_META, NULL, 0, 0);
+ }
+ return 0;
+}
+
+static int notrace psz_pstore_write(struct pstore_record *record)
+{
+ struct psz_context *cxt = record->psi->data;
+
+ if (record->type == PSTORE_TYPE_DMESG &&
+ record->reason == KMSG_DUMP_PANIC)
+ atomic_set(&cxt->on_panic, 1);
+
+ /*
+ * On panic, do not write anything except panic records. This fixes
+ * the case where panic_write() prints a log line that would wake up
+ * the console backend.
+ */
+ if (is_on_panic() && record->type != PSTORE_TYPE_DMESG)
+ return -EBUSY;
+
+ switch (record->type) {
+ case PSTORE_TYPE_DMESG:
+ return psz_kmsg_write(cxt, record);
+ case PSTORE_TYPE_CONSOLE:
+ return psz_record_write(cxt->cpsz, record);
+ case PSTORE_TYPE_PMSG:
+ return psz_record_write(cxt->ppsz, record);
+ case PSTORE_TYPE_FTRACE: {
+ int zonenum = smp_processor_id();
+
+ if (!cxt->fpszs)
+ return -ENOSPC;
+ return psz_record_write(cxt->fpszs[zonenum], record);
+ }
+ default:
+ return -EINVAL;
+ }
+}
+
+static struct pstore_zone *psz_read_next_zone(struct psz_context *cxt)
+{
+ struct pstore_zone *zone = NULL;
+
+ while (cxt->kmsg_read_cnt < cxt->kmsg_max_cnt) {
+ zone = cxt->kpszs[cxt->kmsg_read_cnt++];
+ if (psz_ok(zone))
+ return zone;
+ }
+
+ if (cxt->ftrace_read_cnt < cxt->ftrace_max_cnt)
+ /*
+ * No need to call psz_old_ok() here: psz_ftrace_read() does
+ * that while combining zones, and it must traverse all zones
+ * in case some of them hold no data.
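+ * Zones without data simply contribute nothing to the combined
+ * record.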
+ */ + return cxt->fpszs[cxt->ftrace_read_cnt++]; + + if (cxt->pmsg_read_cnt == 0) { + cxt->pmsg_read_cnt++; + zone = cxt->ppsz; + if (psz_old_ok(zone)) + return zone; + } + + if (cxt->console_read_cnt == 0) { + cxt->console_read_cnt++; + zone = cxt->cpsz; + if (psz_old_ok(zone)) + return zone; + } + + return NULL; +} + +static int psz_kmsg_read_hdr(struct pstore_zone *zone, + struct pstore_record *record) +{ + struct psz_buffer *buffer = zone->buffer; + struct psz_kmsg_header *hdr = + (struct psz_kmsg_header *)buffer->data; + + if (hdr->magic != PSTORE_KMSG_HEADER_MAGIC) + return -EINVAL; + record->compressed = hdr->compressed; + record->time.tv_sec = hdr->time.tv_sec; + record->time.tv_nsec = hdr->time.tv_nsec; + record->reason = hdr->reason; + record->count = hdr->counter; + return 0; +} + +static ssize_t psz_kmsg_read(struct pstore_zone *zone, + struct pstore_record *record) +{ + ssize_t size, hlen = 0; + + size = buffer_datalen(zone); + /* Clear and skip this kmsg dump record if it has no valid header */ + if (psz_kmsg_read_hdr(zone, record)) { + atomic_set(&zone->buffer->datalen, 0); + atomic_set(&zone->dirty, 0); + return -ENOMSG; + } + size -= sizeof(struct psz_kmsg_header); + + if (!record->compressed) { + char *buf = kasprintf(GFP_KERNEL, "%s: Total %d times\n", + kmsg_dump_reason_str(record->reason), + record->count); + hlen = strlen(buf); + record->buf = krealloc(buf, hlen + size, GFP_KERNEL); + if (!record->buf) { + kfree(buf); + return -ENOMEM; + } + } else { + record->buf = kmalloc(size, GFP_KERNEL); + if (!record->buf) + return -ENOMEM; + } + + size = psz_zone_read_buffer(zone, record->buf + hlen, size, + sizeof(struct psz_kmsg_header)); + if (unlikely(size < 0)) { + kfree(record->buf); + return -ENOMSG; + } + + return size + hlen; +} + +/* try to combine all ftrace zones */ +static ssize_t psz_ftrace_read(struct pstore_zone *zone, + struct pstore_record *record) +{ + struct psz_context *cxt; + struct psz_buffer *buf; + int ret; + + if (!zone || !record) + return -ENOSPC; + + if (!psz_old_ok(zone)) + goto out; + + buf = (struct psz_buffer *)zone->oldbuf; + if (!buf) + return -ENOMSG; + + ret = pstore_ftrace_combine_log(&record->buf, &record->size, + (char *)buf->data, atomic_read(&buf->datalen)); + if (unlikely(ret)) + return ret; + +out: + cxt = record->psi->data; + if (cxt->ftrace_read_cnt < cxt->ftrace_max_cnt) + /* then, read next ftrace zone */ + return -ENOMSG; + record->id = 0; + return record->size ? 
record->size : -ENOMSG; +} + +static ssize_t psz_record_read(struct pstore_zone *zone, + struct pstore_record *record) +{ + size_t len; + struct psz_buffer *buf; + + if (!zone || !record) + return -ENOSPC; + + buf = (struct psz_buffer *)zone->oldbuf; + if (!buf) + return -ENOMSG; + + len = atomic_read(&buf->datalen); + record->buf = kmalloc(len, GFP_KERNEL); + if (!record->buf) + return -ENOMEM; + + if (unlikely(psz_zone_read_oldbuf(zone, record->buf, len, 0))) { + kfree(record->buf); + return -ENOMSG; + } + + return len; +} + +static ssize_t psz_pstore_read(struct pstore_record *record) +{ + struct psz_context *cxt = record->psi->data; + ssize_t (*readop)(struct pstore_zone *zone, + struct pstore_record *record); + struct pstore_zone *zone; + ssize_t ret; + + /* before read, we must recover from storage */ + ret = psz_recovery(cxt); + if (ret) + return ret; + +next_zone: + zone = psz_read_next_zone(cxt); + if (!zone) + return 0; + + record->type = zone->type; + switch (record->type) { + case PSTORE_TYPE_DMESG: + readop = psz_kmsg_read; + record->id = cxt->kmsg_read_cnt - 1; + break; + case PSTORE_TYPE_FTRACE: + readop = psz_ftrace_read; + break; + case PSTORE_TYPE_CONSOLE: + fallthrough; + case PSTORE_TYPE_PMSG: + readop = psz_record_read; + break; + default: + goto next_zone; + } + + ret = readop(zone, record); + if (ret == -ENOMSG) + goto next_zone; + return ret; +} + +static struct psz_context pstore_zone_cxt = { + .pstore_zone_info_lock = + __MUTEX_INITIALIZER(pstore_zone_cxt.pstore_zone_info_lock), + .recovered = ATOMIC_INIT(0), + .on_panic = ATOMIC_INIT(0), + .pstore = { + .owner = THIS_MODULE, + .open = psz_pstore_open, + .read = psz_pstore_read, + .write = psz_pstore_write, + .erase = psz_pstore_erase, + }, +}; + +static void psz_free_zone(struct pstore_zone **pszone) +{ + struct pstore_zone *zone = *pszone; + + if (!zone) + return; + + kfree(zone->buffer); + kfree(zone); + *pszone = NULL; +} + +static void psz_free_zones(struct pstore_zone ***pszones, unsigned int *cnt) +{ + struct pstore_zone **zones = *pszones; + + if (!zones) + return; + + while (*cnt > 0) { + (*cnt)--; + psz_free_zone(&(zones[*cnt])); + } + kfree(zones); + *pszones = NULL; +} + +static void psz_free_all_zones(struct psz_context *cxt) +{ + if (cxt->kpszs) + psz_free_zones(&cxt->kpszs, &cxt->kmsg_max_cnt); + if (cxt->ppsz) + psz_free_zone(&cxt->ppsz); + if (cxt->cpsz) + psz_free_zone(&cxt->cpsz); + if (cxt->fpszs) + psz_free_zones(&cxt->fpszs, &cxt->ftrace_max_cnt); +} + +static struct pstore_zone *psz_init_zone(enum pstore_type_id type, + loff_t *off, size_t size) +{ + struct pstore_zone_info *info = pstore_zone_cxt.pstore_zone_info; + struct pstore_zone *zone; + const char *name = pstore_type_to_name(type); + + if (!size) + return NULL; + + if (*off + size > info->total_size) { + pr_err("no room for %s (0x%zx@0x%llx over 0x%lx)\n", + name, size, *off, info->total_size); + return ERR_PTR(-ENOMEM); + } + + zone = kzalloc(sizeof(struct pstore_zone), GFP_KERNEL); + if (!zone) + return ERR_PTR(-ENOMEM); + + zone->buffer = kmalloc(size, GFP_KERNEL); + if (!zone->buffer) { + kfree(zone); + return ERR_PTR(-ENOMEM); + } + memset(zone->buffer, 0xFF, size); + zone->off = *off; + zone->name = name; + zone->type = type; + zone->buffer_size = size - sizeof(struct psz_buffer); + zone->buffer->sig = type ^ PSZ_SIG; + zone->oldbuf = NULL; + atomic_set(&zone->dirty, 0); + atomic_set(&zone->buffer->datalen, 0); + atomic_set(&zone->buffer->start, 0); + + *off += size; + + pr_debug("pszone %s: off 0x%llx, %zu header, %zu data\n", 
zone->name,
+ zone->off, sizeof(*zone->buffer), zone->buffer_size);
+ return zone;
+}
+
+static struct pstore_zone **psz_init_zones(enum pstore_type_id type,
+ loff_t *off, size_t total_size, ssize_t record_size,
+ unsigned int *cnt)
+{
+ struct pstore_zone_info *info = pstore_zone_cxt.pstore_zone_info;
+ struct pstore_zone **zones, *zone;
+ const char *name = pstore_type_to_name(type);
+ int c, i;
+
+ *cnt = 0;
+ if (!total_size || !record_size)
+ return NULL;
+
+ if (*off + total_size > info->total_size) {
+ pr_err("no room for zones %s (0x%zx@0x%llx over 0x%lx)\n",
+ name, total_size, *off, info->total_size);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ c = total_size / record_size;
+ zones = kcalloc(c, sizeof(*zones), GFP_KERNEL);
+ if (!zones) {
+ pr_err("allocate for zones %s failed\n", name);
+ return ERR_PTR(-ENOMEM);
+ }
+ memset(zones, 0, c * sizeof(*zones));
+
+ for (i = 0; i < c; i++) {
+ zone = psz_init_zone(type, off, record_size);
+ if (!zone || IS_ERR(zone)) {
+ pr_err("initialize zones %s failed\n", name);
+ psz_free_zones(&zones, &i);
+ return (void *)zone;
+ }
+ zones[i] = zone;
+ }
+
+ *cnt = c;
+ return zones;
+}
+
+static int psz_alloc_zones(struct psz_context *cxt)
+{
+ struct pstore_zone_info *info = cxt->pstore_zone_info;
+ loff_t off = 0;
+ int err;
+ size_t off_size = 0;
+
+ off_size += info->pmsg_size;
+ cxt->ppsz = psz_init_zone(PSTORE_TYPE_PMSG, &off, info->pmsg_size);
+ if (IS_ERR(cxt->ppsz)) {
+ err = PTR_ERR(cxt->ppsz);
+ cxt->ppsz = NULL;
+ goto free_out;
+ }
+
+ off_size += info->console_size;
+ cxt->cpsz = psz_init_zone(PSTORE_TYPE_CONSOLE, &off,
+ info->console_size);
+ if (IS_ERR(cxt->cpsz)) {
+ err = PTR_ERR(cxt->cpsz);
+ cxt->cpsz = NULL;
+ goto free_out;
+ }
+
+ off_size += info->ftrace_size;
+ cxt->fpszs = psz_init_zones(PSTORE_TYPE_FTRACE, &off,
+ info->ftrace_size,
+ info->ftrace_size / nr_cpu_ids,
+ &cxt->ftrace_max_cnt);
+ if (IS_ERR(cxt->fpszs)) {
+ err = PTR_ERR(cxt->fpszs);
+ cxt->fpszs = NULL;
+ goto free_out;
+ }
+
+ cxt->kpszs = psz_init_zones(PSTORE_TYPE_DMESG, &off,
+ info->total_size - off_size,
+ info->kmsg_size, &cxt->kmsg_max_cnt);
+ if (IS_ERR(cxt->kpszs)) {
+ err = PTR_ERR(cxt->kpszs);
+ cxt->kpszs = NULL;
+ goto free_out;
+ }
+
+ return 0;
+free_out:
+ psz_free_all_zones(cxt);
+ return err;
+}
+
+/**
+ * register_pstore_zone() - register a storage backend with pstore/zone
+ *
+ * @info: back-end driver information. See &struct pstore_zone_info.
+ *
+ * Only one backend can be registered at a time.
+ *
+ * Return: 0 on success, others on failure.
+ */
+int register_pstore_zone(struct pstore_zone_info *info)
+{
+ int err = -EINVAL;
+ struct psz_context *cxt = &pstore_zone_cxt;
+
+ if (info->total_size < 4096) {
+ pr_warn("total_size must be >= 4096\n");
+ return -EINVAL;
+ }
+
+ if (!info->kmsg_size && !info->pmsg_size && !info->console_size &&
+ !info->ftrace_size) {
+ pr_warn("at least one record size must be non-zero\n");
+ return -EINVAL;
+ }
+
+ if (!info->name || !info->name[0])
+ return -EINVAL;
+
+#define check_size(name, size) { \
+ if (info->name > 0 && info->name < (size)) { \
+ pr_err(#name " must be over %d\n", (size)); \
+ return -EINVAL; \
+ } \
+ if (info->name & (size - 1)) { \
+ pr_err(#name " must be a multiple of %d\n", \
+ (size)); \
+ return -EINVAL; \
+ } \
+ }
+
+ check_size(total_size, 4096);
+ check_size(kmsg_size, SECTOR_SIZE);
+ check_size(pmsg_size, SECTOR_SIZE);
+ check_size(console_size, SECTOR_SIZE);
+ check_size(ftrace_size, SECTOR_SIZE);
+
+#undef check_size
+
+ /*
+ * The @read and @write callbacks must both be supplied:
+ * without @read, mounting pstore may fail; without @write, record
+ * files cannot be removed.
+ */
+ if (!info->read || !info->write) {
+ pr_err("no valid general read/write interface\n");
+ return -EINVAL;
+ }
+
+ mutex_lock(&cxt->pstore_zone_info_lock);
+ if (cxt->pstore_zone_info) {
+ pr_warn("'%s' already loaded: ignoring '%s'\n",
+ cxt->pstore_zone_info->name, info->name);
+ mutex_unlock(&cxt->pstore_zone_info_lock);
+ return -EBUSY;
+ }
+ cxt->pstore_zone_info = info;
+
+ pr_debug("register %s with properties:\n", info->name);
+ pr_debug("\ttotal size : %ld Bytes\n", info->total_size);
+ pr_debug("\tkmsg size : %ld Bytes\n", info->kmsg_size);
+ pr_debug("\tpmsg size : %ld Bytes\n", info->pmsg_size);
+ pr_debug("\tconsole size : %ld Bytes\n", info->console_size);
+ pr_debug("\tftrace size : %ld Bytes\n", info->ftrace_size);
+
+ err = psz_alloc_zones(cxt);
+ if (err) {
+ pr_err("alloc zones failed\n");
+ goto fail_out;
+ }
+
+ if (info->kmsg_size) {
+ cxt->pstore.bufsize = cxt->kpszs[0]->buffer_size -
+ sizeof(struct psz_kmsg_header);
+ cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL);
+ if (!cxt->pstore.buf) {
+ err = -ENOMEM;
+ goto fail_free;
+ }
+ }
+ cxt->pstore.data = cxt;
+
+ pr_info("registered %s as backend for", info->name);
+ cxt->pstore.max_reason = info->max_reason;
+ cxt->pstore.name = info->name;
+ if (info->kmsg_size) {
+ cxt->pstore.flags |= PSTORE_FLAGS_DMESG;
+ pr_cont(" kmsg(%s",
+ kmsg_dump_reason_str(cxt->pstore.max_reason));
+ if (cxt->pstore_zone_info->panic_write)
+ pr_cont(",panic_write");
+ pr_cont(")");
+ }
+ if (info->pmsg_size) {
+ cxt->pstore.flags |= PSTORE_FLAGS_PMSG;
+ pr_cont(" pmsg");
+ }
+ if (info->console_size) {
+ cxt->pstore.flags |= PSTORE_FLAGS_CONSOLE;
+ pr_cont(" console");
+ }
+ if (info->ftrace_size) {
+ cxt->pstore.flags |= PSTORE_FLAGS_FTRACE;
+ pr_cont(" ftrace");
+ }
+ pr_cont("\n");
+
+ err = pstore_register(&cxt->pstore);
+ if (err) {
+ pr_err("registering with pstore failed\n");
+ goto fail_free;
+ }
+ mutex_unlock(&pstore_zone_cxt.pstore_zone_info_lock);
+
+ return 0;
+
+fail_free:
+ kfree(cxt->pstore.buf);
+ cxt->pstore.buf = NULL;
+ cxt->pstore.bufsize = 0;
+ psz_free_all_zones(cxt);
+fail_out:
+ pstore_zone_cxt.pstore_zone_info = NULL;
+ mutex_unlock(&pstore_zone_cxt.pstore_zone_info_lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(register_pstore_zone);
+
+/**
+ * unregister_pstore_zone() - unregister a backend from pstore/zone
+ *
+ * @info: back-end driver information. See &struct pstore_zone_info.
+ */
+void unregister_pstore_zone(struct pstore_zone_info *info)
+{
+ struct psz_context *cxt = &pstore_zone_cxt;
+
+ mutex_lock(&cxt->pstore_zone_info_lock);
+ if (!cxt->pstore_zone_info) {
+ mutex_unlock(&cxt->pstore_zone_info_lock);
+ return;
+ }
+
+ /* Stop incoming writes from pstore. */
+ pstore_unregister(&cxt->pstore);
+
+ /* Flush any pending writes. */
+ psz_flush_all_dirty_zones(NULL);
+ flush_delayed_work(&psz_cleaner);
+
+ /* Clean up allocations. */
+ kfree(cxt->pstore.buf);
+ cxt->pstore.buf = NULL;
+ cxt->pstore.bufsize = 0;
+ cxt->pstore_zone_info = NULL;
+
+ psz_free_all_zones(cxt);
+
+ /*
+ * Clear counters and zone state
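+ * so that a later register_pstore_zone() starts from a clean slate.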
*/ + cxt->oops_counter = 0; + cxt->panic_counter = 0; + atomic_set(&cxt->recovered, 0); + atomic_set(&cxt->on_panic, 0); + + mutex_unlock(&cxt->pstore_zone_info_lock); +} +EXPORT_SYMBOL_GPL(unregister_pstore_zone); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("WeiXiong Liao <liaoweixiong@allwinnertech.com>"); +MODULE_AUTHOR("Kees Cook <keescook@chromium.org>"); +MODULE_DESCRIPTION("Storage Manager for pstore/blk"); diff --git a/fs/splice.c b/fs/splice.c index 4735defc46ee..4e53efbd621d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1118,6 +1118,10 @@ long do_splice(struct file *in, loff_t __user *off_in, loff_t offset; long ret; + if (unlikely(!(in->f_mode & FMODE_READ) || + !(out->f_mode & FMODE_WRITE))) + return -EBADF; + ipipe = get_pipe_info(in); opipe = get_pipe_info(out); @@ -1125,12 +1129,6 @@ long do_splice(struct file *in, loff_t __user *off_in, if (off_in || off_out) return -ESPIPE; - if (!(in->f_mode & FMODE_READ)) - return -EBADF; - - if (!(out->f_mode & FMODE_WRITE)) - return -EBADF; - /* Splicing to self would be fun, but... */ if (ipipe == opipe) return -EINVAL; @@ -1153,9 +1151,6 @@ long do_splice(struct file *in, loff_t __user *off_in, offset = out->f_pos; } - if (unlikely(!(out->f_mode & FMODE_WRITE))) - return -EBADF; - if (unlikely(out->f_flags & O_APPEND)) return -EINVAL; @@ -1440,15 +1435,11 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, error = -EBADF; in = fdget(fd_in); if (in.file) { - if (in.file->f_mode & FMODE_READ) { - out = fdget(fd_out); - if (out.file) { - if (out.file->f_mode & FMODE_WRITE) - error = do_splice(in.file, off_in, - out.file, off_out, - len, flags); - fdput(out); - } + out = fdget(fd_out); + if (out.file) { + error = do_splice(in.file, off_in, out.file, off_out, + len, flags); + fdput(out); } fdput(in); } @@ -1503,7 +1494,7 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) * Check pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) return 0; ret = 0; @@ -1770,6 +1761,10 @@ static long do_tee(struct file *in, struct file *out, size_t len, struct pipe_inode_info *opipe = get_pipe_info(out); int ret = -EINVAL; + if (unlikely(!(in->f_mode & FMODE_READ) || + !(out->f_mode & FMODE_WRITE))) + return -EBADF; + /* * Duplicate the contents of ipipe to opipe without actually * copying the data. 
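An illustrative aside, not part of the patch: with the mode checks consolidated at the top of do_splice() and do_tee(), a descriptor lacking the needed FMODE_* flag now fails up front with -EBADF instead of inside the per-case branches. A minimal userspace sketch, assuming an arbitrary file path:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>

int main(void)
{
	int pipefd[2];
	int fd = open("/tmp/demo.txt", O_WRONLY | O_CREAT, 0644);

	if (fd < 0 || pipe(pipefd) < 0)
		return 1;

	/* fd is write-only, so splicing *from* it fails with EBADF,
	 * caught by the single check at the top of do_splice(). */
	if (splice(fd, NULL, pipefd[1], NULL, 4096, 0) < 0)
		printf("splice: %s\n", strerror(errno));

	close(fd);
	close(pipefd[0]);
	close(pipefd[1]);
	return 0;
}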
@@ -1795,7 +1790,7 @@ static long do_tee(struct file *in, struct file *out, size_t len, SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) { - struct fd in; + struct fd in, out; int error; if (unlikely(flags & ~SPLICE_F_ALL)) @@ -1807,14 +1802,10 @@ SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) error = -EBADF; in = fdget(fdin); if (in.file) { - if (in.file->f_mode & FMODE_READ) { - struct fd out = fdget(fdout); - if (out.file) { - if (out.file->f_mode & FMODE_WRITE) - error = do_tee(in.file, out.file, - len, flags); - fdput(out); - } + out = fdget(fdout); + if (out.file) { + error = do_tee(in.file, out.file, len, flags); + fdput(out); } fdput(in); } diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c index 2a2a2d106440..e206ebfe003c 100644 --- a/fs/squashfs/decompressor_multi_percpu.c +++ b/fs/squashfs/decompressor_multi_percpu.c @@ -8,6 +8,7 @@ #include <linux/slab.h> #include <linux/percpu.h> #include <linux/buffer_head.h> +#include <linux/local_lock.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -20,7 +21,8 @@ */ struct squashfs_stream { - void *stream; + void *stream; + local_lock_t lock; }; void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, @@ -41,6 +43,7 @@ void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, err = PTR_ERR(stream->stream); goto out; } + local_lock_init(&stream->lock); } kfree(comp_opts); @@ -75,12 +78,16 @@ void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk) int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, int b, int offset, int length, struct squashfs_page_actor *output) { - struct squashfs_stream __percpu *percpu = - (struct squashfs_stream __percpu *) msblk->stream; - struct squashfs_stream *stream = get_cpu_ptr(percpu); - int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, - offset, length, output); - put_cpu_ptr(stream); + struct squashfs_stream *stream; + int res; + + local_lock(&msblk->stream->lock); + stream = this_cpu_ptr(msblk->stream); + + res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, + offset, length, output); + + local_unlock(&msblk->stream->lock); if (res < 0) ERROR("%s decompression failed, data probably corrupt\n", diff --git a/fs/super.c b/fs/super.c index cd352530eca9..a288cd60d2ae 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1302,8 +1302,8 @@ int get_tree_bdev(struct fs_context *fc, mutex_lock(&bdev->bd_fsfreeze_mutex); if (bdev->bd_fsfreeze_count > 0) { mutex_unlock(&bdev->bd_fsfreeze_mutex); - blkdev_put(bdev, mode); warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); + blkdev_put(bdev, mode); return -EBUSY; } diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index 8cdbd53d780c..cc5c0abfd536 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -31,15 +31,9 @@ int __ubifs_node_calc_hash(const struct ubifs_info *c, const void *node, u8 *hash) { const struct ubifs_ch *ch = node; - SHASH_DESC_ON_STACK(shash, c->hash_tfm); - int err; - - shash->tfm = c->hash_tfm; - err = crypto_shash_digest(shash, node, le32_to_cpu(ch->len), hash); - if (err < 0) - return err; - return 0; + return crypto_shash_tfm_digest(c->hash_tfm, node, le32_to_cpu(ch->len), + hash); } /** @@ -53,15 +47,7 @@ int __ubifs_node_calc_hash(const struct ubifs_info *c, const void *node, static int ubifs_hash_calc_hmac(const struct ubifs_info *c, const u8 *hash, u8 *hmac) { - SHASH_DESC_ON_STACK(shash, c->hmac_tfm); - int err; - - shash->tfm = c->hmac_tfm; - 
- err = crypto_shash_digest(shash, hash, c->hash_len, hmac); - if (err < 0) - return err; - return 0; + return crypto_shash_tfm_digest(c->hmac_tfm, hash, c->hash_len, hmac); } /** @@ -79,13 +65,9 @@ int ubifs_prepare_auth_node(struct ubifs_info *c, void *node, struct shash_desc *inhash) { struct ubifs_auth_node *auth = node; - u8 *hash; + u8 hash[UBIFS_HASH_ARR_SZ]; int err; - hash = kmalloc(crypto_shash_descsize(c->hash_tfm), GFP_NOFS); - if (!hash) - return -ENOMEM; - { SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm); @@ -94,21 +76,16 @@ int ubifs_prepare_auth_node(struct ubifs_info *c, void *node, err = crypto_shash_final(hash_desc, hash); if (err) - goto out; + return err; } err = ubifs_hash_calc_hmac(c, hash, auth->hmac); if (err) - goto out; + return err; auth->ch.node_type = UBIFS_AUTH_NODE; ubifs_prepare_node(c, auth, ubifs_auth_node_sz(c), 0); - - err = 0; -out: - kfree(hash); - - return err; + return 0; } static struct shash_desc *ubifs_get_desc(const struct ubifs_info *c, diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 743928efffc1..49fe062ce45e 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1375,7 +1375,6 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time, struct ubifs_info *c = inode->i_sb->s_fs_info; struct ubifs_budget_req req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(ui->data_len, 8) }; - int iflags = I_DIRTY_TIME; int err, release; if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT)) @@ -1393,11 +1392,8 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time, if (flags & S_MTIME) inode->i_mtime = *time; - if (!(inode->i_sb->s_flags & SB_LAZYTIME)) - iflags |= I_DIRTY_SYNC; - release = ui->dirty; - __mark_inode_dirty(inode, iflags); + __mark_inode_dirty(inode, I_DIRTY_SYNC); mutex_unlock(&ui->ui_mutex); if (release) ubifs_release_budget(c, &req); diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index 52a85c01397e..911d0555b9f2 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -68,12 +68,9 @@ static int mst_node_check_hash(const struct ubifs_info *c, u8 calc[UBIFS_MAX_HASH_LEN]; const void *node = mst; - SHASH_DESC_ON_STACK(shash, c->hash_tfm); - - shash->tfm = c->hash_tfm; - - crypto_shash_digest(shash, node + sizeof(struct ubifs_ch), - UBIFS_MST_NODE_SZ - sizeof(struct ubifs_ch), calc); + crypto_shash_tfm_digest(c->hash_tfm, node + sizeof(struct ubifs_ch), + UBIFS_MST_NODE_SZ - sizeof(struct ubifs_ch), + calc); if (ubifs_check_hash(c, expected, calc)) return -EPERM; diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index b28ac4dfb407..b69ffac7e415 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -558,7 +558,7 @@ static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud) return data == 0xFFFFFFFF; } -/* authenticate_sleb_hash and authenticate_sleb_hmac are split out for stack usage */ +/* authenticate_sleb_hash is split out for stack usage */ static int authenticate_sleb_hash(struct ubifs_info *c, struct shash_desc *log_hash, u8 *hash) { SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm); @@ -569,15 +569,6 @@ static int authenticate_sleb_hash(struct ubifs_info *c, struct shash_desc *log_h return crypto_shash_final(hash_desc, hash); } -static int authenticate_sleb_hmac(struct ubifs_info *c, u8 *hash, u8 *hmac) -{ - SHASH_DESC_ON_STACK(hmac_desc, c->hmac_tfm); - - hmac_desc->tfm = c->hmac_tfm; - - return crypto_shash_digest(hmac_desc, hash, c->hash_len, hmac); -} - /** * authenticate_sleb - authenticate one scan LEB * @c: UBIFS file-system description object @@ -601,18 +592,12 @@ static int authenticate_sleb(struct 
ubifs_info *c, struct ubifs_scan_leb *sleb, struct ubifs_scan_node *snod; int n_nodes = 0; int err; - u8 *hash, *hmac; + u8 hash[UBIFS_HASH_ARR_SZ]; + u8 hmac[UBIFS_HMAC_ARR_SZ]; if (!ubifs_authenticated(c)) return sleb->nodes_cnt; - hash = kmalloc(crypto_shash_descsize(c->hash_tfm), GFP_NOFS); - hmac = kmalloc(c->hmac_desc_len, GFP_NOFS); - if (!hash || !hmac) { - err = -ENOMEM; - goto out; - } - list_for_each_entry(snod, &sleb->nodes, list) { n_nodes++; @@ -624,7 +609,8 @@ static int authenticate_sleb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, if (err) goto out; - err = authenticate_sleb_hmac(c, hash, hmac); + err = crypto_shash_tfm_digest(c->hmac_tfm, hash, + c->hash_len, hmac); if (err) goto out; @@ -662,9 +648,6 @@ static int authenticate_sleb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, err = 0; } out: - kfree(hash); - kfree(hmac); - return err ? err : n_nodes - n_not_auth; } diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index 675e26989376..8fe03b4a0d2b 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -164,7 +164,7 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc) goto fail_free; } - err = super_setup_bdi_name(sb, "vboxsf-%s.%d", fc->source, sbi->bdi_id); + err = super_setup_bdi_name(sb, "vboxsf-%d", sbi->bdi_id); if (err) goto fail_free; diff --git a/fs/verity/enable.c b/fs/verity/enable.c index d98bea308fd7..5ab3bbec8108 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -329,6 +329,8 @@ rollback: /** * fsverity_ioctl_enable() - enable verity on a file + * @filp: file to enable verity on + * @uarg: user pointer to fsverity_enable_arg * * Enable fs-verity on a file. See the "FS_IOC_ENABLE_VERITY" section of * Documentation/filesystems/fsverity.rst for the documentation. diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 74768cf539da..e96d99d5145e 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -61,7 +61,7 @@ struct merkle_tree_params { u64 level_start[FS_VERITY_MAX_LEVELS]; }; -/** +/* * fsverity_info - cached verity metadata for an inode * * When a verity file is first opened, an instance of this struct is allocated @@ -134,7 +134,7 @@ void __init fsverity_check_hash_algs(void); /* init.c */ -extern void __printf(3, 4) __cold +void __printf(3, 4) __cold fsverity_msg(const struct inode *inode, const char *level, const char *fmt, ...); diff --git a/fs/verity/measure.c b/fs/verity/measure.c index 05049b68c745..df409a5682ed 100644 --- a/fs/verity/measure.c +++ b/fs/verity/measure.c @@ -11,6 +11,8 @@ /** * fsverity_ioctl_measure() - get a verity file's measurement + * @filp: file to get measurement of + * @_uarg: user pointer to fsverity_digest * * Retrieve the file measurement that the kernel is enforcing for reads from a * verity file. See the "FS_IOC_MEASURE_VERITY" section of diff --git a/fs/verity/open.c b/fs/verity/open.c index c5fe6948e262..d007db0c9304 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -330,6 +330,7 @@ EXPORT_SYMBOL_GPL(fsverity_prepare_setattr); /** * fsverity_cleanup_inode() - free the inode's verity info, if present + * @inode: an inode being evicted * * Filesystems must call this on inode eviction to free ->i_verity_info. 
*/ diff --git a/fs/verity/signature.c b/fs/verity/signature.c index c8b255232de5..b14ed96387ec 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -28,6 +28,9 @@ static struct key *fsverity_keyring; /** * fsverity_verify_signature() - check a verity file's signature + * @vi: the file's fsverity_info + * @desc: the file's fsverity_descriptor + * @desc_size: size of @desc * * If the file's fs-verity descriptor includes a signature of the file * measurement, verify it against the certificates in the fs-verity keyring. diff --git a/fs/verity/verify.c b/fs/verity/verify.c index e0cb62da3864..a8b68c6f663d 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -179,6 +179,7 @@ out: /** * fsverity_verify_page() - verify a data page + * @page: the page to verify * * Verify a page that has just been read from a verity file. The page must be a * pagecache page that is still locked and not yet uptodate. @@ -206,6 +207,7 @@ EXPORT_SYMBOL_GPL(fsverity_verify_page); #ifdef CONFIG_BLOCK /** * fsverity_verify_bio() - verify a 'read' bio that has just completed + * @bio: the bio to verify * * Verify a set of pages that have just been read from a verity file. The pages * must be pagecache pages that are still locked and not yet uptodate. Pages @@ -264,6 +266,7 @@ EXPORT_SYMBOL_GPL(fsverity_verify_bio); /** * fsverity_enqueue_verify_work() - enqueue work on the fs-verity workqueue + * @work: the work to enqueue * * Enqueue verification work for asynchronous processing. */ diff --git a/fs/xattr.c b/fs/xattr.c index e13265e65871..91608d9bfc6a 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -876,6 +876,9 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, struct simple_xattr *new_xattr = NULL; int err = 0; + if (removed_size) + *removed_size = -1; + /* value == NULL means remove */ if (value) { new_xattr = simple_xattr_alloc(value, size); @@ -914,9 +917,6 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, list_add(&new_xattr->list, &xattrs->head); xattr = NULL; } - - if (removed_size) - *removed_size = -1; out: spin_unlock(&xattrs->lock); if (xattr) {
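For context, a sketch of how a storage back-end might use the register/unregister pair from the pstore/zone hunks above. All names (sample_*), the sizes, and the vmalloc'ed backing buffer are hypothetical; only the pstore_zone_info callbacks and the size constraints enforced by register_pstore_zone() are taken from the patch:

#include <linux/module.h>
#include <linux/pstore_zone.h>
#include <linux/string.h>
#include <linux/vmalloc.h>

static char *sample_buf;	/* stand-in for real storage */

/* Required: without @read, mounting pstore may fail. */
static ssize_t sample_read(char *buf, size_t bytes, loff_t pos)
{
	memcpy(buf, sample_buf + pos, bytes);
	return bytes;
}

/* Required: record removal depends on @write. */
static ssize_t sample_write(const char *buf, size_t bytes, loff_t pos)
{
	memcpy(sample_buf + pos, buf, bytes);
	return bytes;
}

static struct pstore_zone_info sample_info = {
	.owner		= THIS_MODULE,
	.name		= "sample",
	.total_size	= 64 * 1024,	/* >= 4096 and a multiple of 4096 */
	.kmsg_size	= 16 * 1024,	/* record sizes: multiples of SECTOR_SIZE */
	.pmsg_size	= 4 * 1024,
	.max_reason	= KMSG_DUMP_OOPS,
	.read		= sample_read,
	.write		= sample_write,
};

static int __init sample_init(void)
{
	sample_buf = vzalloc(sample_info.total_size);
	if (!sample_buf)
		return -ENOMEM;
	return register_pstore_zone(&sample_info);
}

static void __exit sample_exit(void)
{
	unregister_pstore_zone(&sample_info);
	vfree(sample_buf);
}

module_init(sample_init);
module_exit(sample_exit);
MODULE_LICENSE("GPL");

With kmsg_size, pmsg_size, console_size and ftrace_size as above, psz_alloc_zones() would carve three dmesg zones ((64K - 4K) / 16K) plus one pmsg zone out of the 64K region.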