From 68227c03cba84a24faf8a7277d2b1a03c8959c2c Mon Sep 17 00:00:00 2001 From: Mateusz Jurczyk Date: Wed, 7 Jun 2017 12:26:49 +0200 Subject: fuse: initialize the flock flag in fuse_file on allocation Before the patch, the flock flag could remain uninitialized for the lifespan of the fuse_file allocation. Unless set to true in fuse_file_flock(), it would remain in an indeterminate state until read in an if statement in fuse_release_common(). This could consequently lead to taking an unexpected branch in the code. The bug was discovered by a runtime instrumentation designed to detect use of uninitialized memory in the kernel. Signed-off-by: Mateusz Jurczyk Fixes: 37fb3a30b462 ("fuse: fix flock") Cc: # v3.1+ Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3ee4fdc3da9e..76eac2a554c4 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -46,7 +46,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) { struct fuse_file *ff; - ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); + ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL); if (unlikely(!ff)) return NULL; -- cgit From 61c12b49e1c9c77d7a1bcc161de540d0fd21cf0c Mon Sep 17 00:00:00 2001 From: Ashish Samant Date: Wed, 12 Jul 2017 19:26:58 -0700 Subject: fuse: Dont call set_page_dirty_lock() for ITER_BVEC pages for async_dio Commit 8fba54aebbdf ("fuse: direct-io: don't dirty ITER_BVEC pages") fixes the ITER_BVEC page deadlock for direct io in fuse by checking in fuse_direct_io(), whether the page is a bvec page or not, before locking it. However, this check is missed when the "async_dio" mount option is enabled. In this case, set_page_dirty_lock() is called from the req->end callback in request_end(), when the fuse thread is returning from userspace to respond to the read request. This will cause the same deadlock because the bvec condition is not checked in this path. Here is the stack of the deadlocked thread, while returning from userspace: [13706.656686] INFO: task glusterfs:3006 blocked for more than 120 seconds. [13706.657808] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [13706.658788] glusterfs D ffffffff816c80f0 0 3006 1 0x00000080 [13706.658797] ffff8800d6713a58 0000000000000086 ffff8800d9ad7000 ffff8800d9ad5400 [13706.658799] ffff88011ffd5cc0 ffff8800d6710008 ffff88011fd176c0 7fffffffffffffff [13706.658801] 0000000000000002 ffffffff816c80f0 ffff8800d6713a78 ffffffff816c790e [13706.658803] Call Trace: [13706.658809] [] ? bit_wait_io_timeout+0x80/0x80 [13706.658811] [] schedule+0x3e/0x90 [13706.658813] [] schedule_timeout+0x1b5/0x210 [13706.658816] [] ? gup_pud_range+0x1db/0x1f0 [13706.658817] [] ? kvm_clock_read+0x1e/0x20 [13706.658819] [] ? kvm_clock_get_cycles+0x9/0x10 [13706.658822] [] ? ktime_get+0x52/0xc0 [13706.658824] [] io_schedule_timeout+0xa4/0x110 [13706.658826] [] bit_wait_io+0x36/0x50 [13706.658828] [] __wait_on_bit_lock+0x76/0xb0 [13706.658831] [] ? lock_request+0x46/0x70 [fuse] [13706.658834] [] __lock_page+0xaa/0xb0 [13706.658836] [] ? wake_atomic_t_function+0x40/0x40 [13706.658838] [] set_page_dirty_lock+0x58/0x60 [13706.658841] [] fuse_release_user_pages+0x58/0x70 [fuse] [13706.658844] [] ? fuse_aio_complete+0x190/0x190 [fuse] [13706.658847] [] fuse_aio_complete_req+0x29/0x90 [fuse] [13706.658849] [] request_end+0xd9/0x190 [fuse] [13706.658852] [] fuse_dev_do_write+0x336/0x490 [fuse] [13706.658854] [] fuse_dev_write+0x6e/0xa0 [fuse] [13706.658857] [] ? security_file_permission+0x23/0x90 [13706.658859] [] do_iter_readv_writev+0x60/0x90 [13706.658862] [] ? fuse_dev_splice_write+0x350/0x350 [fuse] [13706.658863] [] do_readv_writev+0x171/0x1f0 [13706.658866] [] ? try_to_wake_up+0x210/0x210 [13706.658868] [] vfs_writev+0x41/0x50 [13706.658870] [] SyS_writev+0x56/0xf0 [13706.658872] [] ? syscall_trace_leave+0xf1/0x160 [13706.658874] [] system_call_fastpath+0x12/0x71 Fix this by making should_dirty a fuse_io_priv parameter that can be checked in fuse_aio_complete_req(). Reported-by: Tiger Yang Signed-off-by: Ashish Samant Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 6 +++--- fs/fuse/fuse_i.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 76eac2a554c4..810ed4f99e38 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -609,7 +609,7 @@ static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req) struct fuse_io_priv *io = req->io; ssize_t pos = -1; - fuse_release_user_pages(req, !io->write); + fuse_release_user_pages(req, io->should_dirty); if (io->write) { if (req->misc.write.in.size != req->misc.write.out.size) @@ -1316,7 +1316,6 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, loff_t *ppos, int flags) { int write = flags & FUSE_DIO_WRITE; - bool should_dirty = !write && iter_is_iovec(iter); int cuse = flags & FUSE_DIO_CUSE; struct file *file = io->file; struct inode *inode = file->f_mapping->host; @@ -1346,6 +1345,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, inode_unlock(inode); } + io->should_dirty = !write && iter_is_iovec(iter); while (count) { size_t nres; fl_owner_t owner = current->files; @@ -1360,7 +1360,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, nres = fuse_send_read(req, io, pos, nbytes, owner); if (!io->async) - fuse_release_user_pages(req, should_dirty); + fuse_release_user_pages(req, io->should_dirty); if (req->out.h.error) { err = req->out.h.error; break; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1bd7ffdad593..bd4d2a3e1ec1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -249,6 +249,7 @@ struct fuse_io_priv { size_t size; __u64 offset; bool write; + bool should_dirty; int err; struct kiocb *iocb; struct file *file; -- cgit From 41e327b586762833e48b3703d53312ac32f05f24 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Mon, 7 Aug 2017 21:35:05 +0800 Subject: quota: correct space limit check Currently we compare total space (curspace + rsvspace) with space limit in quota-tools when setting grace time and also in check_bdq(), but we missing rsvspace in somewhere else, correct them. This patch also fix incorrect zero dqb_btime and grace time updating failure when we use rsvspace(e.g. ext4 dalloc feature). Signed-off-by: zhangyi (F) Signed-off-by: Jan Kara --- fs/quota/dquot.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 53a17496c5c5..566e6ef99f07 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1124,6 +1124,10 @@ void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) WARN_ON_ONCE(1); dquot->dq_dqb.dqb_rsvspace = 0; } + if (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace <= + dquot->dq_dqb.dqb_bsoftlimit) + dquot->dq_dqb.dqb_btime = (time64_t) 0; + clear_bit(DQ_BLKS_B, &dquot->dq_flags); } static void dquot_decr_inodes(struct dquot *dquot, qsize_t number) @@ -1145,7 +1149,8 @@ static void dquot_decr_space(struct dquot *dquot, qsize_t number) dquot->dq_dqb.dqb_curspace -= number; else dquot->dq_dqb.dqb_curspace = 0; - if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit) + if (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace <= + dquot->dq_dqb.dqb_bsoftlimit) dquot->dq_dqb.dqb_btime = (time64_t) 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } @@ -1381,14 +1386,18 @@ static int info_idq_free(struct dquot *dquot, qsize_t inodes) static int info_bdq_free(struct dquot *dquot, qsize_t space) { + qsize_t tspace; + + tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace; + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || - dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit) + tspace <= dquot->dq_dqb.dqb_bsoftlimit) return QUOTA_NL_NOWARN; - if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit) + if (tspace - space <= dquot->dq_dqb.dqb_bsoftlimit) return QUOTA_NL_BSOFTBELOW; - if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit && - dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit) + if (tspace >= dquot->dq_dqb.dqb_bhardlimit && + tspace - space < dquot->dq_dqb.dqb_bhardlimit) return QUOTA_NL_BHARDBELOW; return QUOTA_NL_NOWARN; } @@ -2681,7 +2690,7 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) if (check_blim) { if (!dm->dqb_bsoftlimit || - dm->dqb_curspace < dm->dqb_bsoftlimit) { + dm->dqb_curspace + dm->dqb_rsvspace < dm->dqb_bsoftlimit) { dm->dqb_btime = 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } else if (!(di->d_fieldmask & QC_SPC_TIMER)) -- cgit From 1feb26162bee7b2f110facfec71b5c7bdbc7d14d Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 1 Aug 2017 16:25:01 -0400 Subject: nfs/flexfiles: fix leak of nfs4_ff_ds_version arrays The client was freeing the nfs4_ff_layout_ds, but not the contained nfs4_ff_ds_version array. Signed-off-by: Weston Andros Adamson Cc: stable@vger.kernel.org # v4.0+ Signed-off-by: Anna Schumaker --- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 6df7a0cf5660..f32c58bbe556 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -32,6 +32,7 @@ void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds) { nfs4_print_deviceid(&mirror_ds->id_node.deviceid); nfs4_pnfs_ds_put(mirror_ds->ds); + kfree(mirror_ds->ds_versions); kfree_rcu(mirror_ds, id_node.rcu); } -- cgit From c0ca0e5934b3bd70d456b363a6a989a3b4a692f1 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 8 Aug 2017 21:39:28 -0400 Subject: NFSv4: Ignore NFS4ERR_OLD_STATEID in nfs41_check_open_stateid() If the call to TEST_STATEID returns NFS4ERR_OLD_STATEID, then it just means we raced with other calls to OPEN. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3e46be8c9be1..cbf9cdd48a3b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2553,9 +2553,8 @@ static int nfs41_check_open_stateid(struct nfs4_state *state) clear_bit(NFS_O_RDWR_STATE, &state->flags); clear_bit(NFS_OPEN_STATE, &state->flags); stateid->type = NFS4_INVALID_STATEID_TYPE; - } - if (status != NFS_OK) return status; + } if (nfs_open_stateid_recover_openmode(state)) return -NFS4ERR_OPENMODE; return NFS_OK; -- cgit From d507e2ebd2c7be9138e5cf5c0cb1931c90c42ab1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 10 Aug 2017 15:23:31 -0700 Subject: mm: fix global NR_SLAB_.*CLAIMABLE counter reads As Tetsuo points out: "Commit 385386cff4c6 ("mm: vmstat: move slab statistics from zone to node counters") broke "Slab:" field of /proc/meminfo . It shows nearly 0kB" In addition to /proc/meminfo, this problem also affects the slab counters OOM/allocation failure info dumps, can cause early -ENOMEM from overcommit protection, and miscalculate image size requirements during suspend-to-disk. This is because the patch in question switched the slab counters from the zone level to the node level, but forgot to update the global accessor functions to read the aggregate node data instead of the aggregate zone data. Use global_node_page_state() to access the global slab counters. Fixes: 385386cff4c6 ("mm: vmstat: move slab statistics from zone to node counters") Link: http://lkml.kernel.org/r/20170801134256.5400-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Tetsuo Handa Acked-by: Michal Hocko Cc: Josef Bacik Cc: Vladimir Davydov Cc: Stefan Agner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 8 ++++---- kernel/power/snapshot.c | 2 +- mm/page_alloc.c | 9 +++++---- mm/util.c | 2 +- 4 files changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 8a428498d6b2..509a61668d90 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -106,13 +106,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_node_page_state(NR_FILE_MAPPED)); show_val_kb(m, "Shmem: ", i.sharedram); show_val_kb(m, "Slab: ", - global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE)); + global_node_page_state(NR_SLAB_RECLAIMABLE) + + global_node_page_state(NR_SLAB_UNRECLAIMABLE)); show_val_kb(m, "SReclaimable: ", - global_page_state(NR_SLAB_RECLAIMABLE)); + global_node_page_state(NR_SLAB_RECLAIMABLE)); show_val_kb(m, "SUnreclaim: ", - global_page_state(NR_SLAB_UNRECLAIMABLE)); + global_node_page_state(NR_SLAB_UNRECLAIMABLE)); seq_printf(m, "KernelStack: %8lu kB\n", global_page_state(NR_KERNEL_STACK_KB)); show_val_kb(m, "PageTables: ", diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 222317721c5a..0972a8e09d08 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1650,7 +1650,7 @@ static unsigned long minimum_image_size(unsigned long saveable) { unsigned long size; - size = global_page_state(NR_SLAB_RECLAIMABLE) + size = global_node_page_state(NR_SLAB_RECLAIMABLE) + global_node_page_state(NR_ACTIVE_ANON) + global_node_page_state(NR_INACTIVE_ANON) + global_node_page_state(NR_ACTIVE_FILE) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fc32aa81f359..626a430e32d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4458,8 +4458,9 @@ long si_mem_available(void) * Part of the reclaimable slab consists of items that are in use, * and cannot be freed. Cap this estimate at the low watermark. */ - available += global_page_state(NR_SLAB_RECLAIMABLE) - - min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + available += global_node_page_state(NR_SLAB_RECLAIMABLE) - + min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, + wmark_low); if (available < 0) available = 0; @@ -4602,8 +4603,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_FILE_DIRTY), global_node_page_state(NR_WRITEBACK), global_node_page_state(NR_UNSTABLE_NFS), - global_page_state(NR_SLAB_RECLAIMABLE), - global_page_state(NR_SLAB_UNRECLAIMABLE), + global_node_page_state(NR_SLAB_RECLAIMABLE), + global_node_page_state(NR_SLAB_UNRECLAIMABLE), global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), diff --git a/mm/util.c b/mm/util.c index 7b07ec852e01..9ecddf568fe3 100644 --- a/mm/util.c +++ b/mm/util.c @@ -633,7 +633,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) * which are reclaimable, under pressure. The dentry * cache and most inode caches should fall into this */ - free += global_page_state(NR_SLAB_RECLAIMABLE); + free += global_node_page_state(NR_SLAB_RECLAIMABLE); /* * Leave reserved pages. The pages are not for anonymous pages. -- cgit From b3a81d0841a954a3d3e782059960f53e63b37066 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 10 Aug 2017 15:24:15 -0700 Subject: mm: fix KSM data corruption Nadav reported KSM can corrupt the user data by the TLB batching race[1]. That means data user written can be lost. Quote from Nadav Amit: "For this race we need 4 CPUs: CPU0: Caches a writable and dirty PTE entry, and uses the stale value for write later. CPU1: Runs madvise_free on the range that includes the PTE. It would clear the dirty-bit. It batches TLB flushes. CPU2: Writes 4 to /proc/PID/clear_refs , clearing the PTEs soft-dirty. We care about the fact that it clears the PTE write-bit, and of course, batches TLB flushes. CPU3: Runs KSM. Our purpose is to pass the following test in write_protect_page(): if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte))) Since it will avoid TLB flush. And we want to do it while the PTE is stale. Later, and before replacing the page, we would be able to change the page. Note that all the operations the CPU1-3 perform canhappen in parallel since they only acquire mmap_sem for read. We start with two identical pages. Everything below regards the same page/PTE. CPU0 CPU1 CPU2 CPU3 ---- ---- ---- ---- Write the same value on page [cache PTE as dirty in TLB] MADV_FREE pte_mkclean() 4 > clear_refs pte_wrprotect() write_protect_page() [ success, no flush ] pages_indentical() [ ok ] Write to page different value [Ok, using stale PTE] replace_page() Later, CPU1, CPU2 and CPU3 would flush the TLB, but that is too late. CPU0 already wrote on the page, but KSM ignored this write, and it got lost" In above scenario, MADV_FREE is fixed by changing TLB batching API including [set|clear]_tlb_flush_pending. Remained thing is soft-dirty part. This patch changes soft-dirty uses TLB batching API instead of flush_tlb_mm and KSM checks pending TLB flush by using mm_tlb_flush_pending so that it will flush TLB to avoid data lost if there are other parallel threads pending TLB flush. [1] http://lkml.kernel.org/r/BD3A0EBE-ECF4-41D4-87FA-C755EA9AB6BD@gmail.com Link: http://lkml.kernel.org/r/20170802000818.4760-8-namit@vmware.com Signed-off-by: Minchan Kim Signed-off-by: Nadav Amit Reported-by: Nadav Amit Tested-by: Nadav Amit Reviewed-by: Andrea Arcangeli Cc: Mel Gorman Cc: Hugh Dickins Cc: "David S. Miller" Cc: Andy Lutomirski Cc: Heiko Carstens Cc: Ingo Molnar Cc: Jeff Dike Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Nadav Amit Cc: Rik van Riel Cc: Russell King Cc: Sergey Senozhatsky Cc: Tony Luck Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 7 +++++-- mm/ksm.c | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b836fd61ed87..fe8f3265e877 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -16,9 +16,10 @@ #include #include #include +#include #include -#include +#include #include #include "internal.h" @@ -1008,6 +1009,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, struct mm_struct *mm; struct vm_area_struct *vma; enum clear_refs_types type; + struct mmu_gather tlb; int itype; int rv; @@ -1054,6 +1056,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } down_read(&mm->mmap_sem); + tlb_gather_mmu(&tlb, mm, 0, -1); if (type == CLEAR_REFS_SOFT_DIRTY) { for (vma = mm->mmap; vma; vma = vma->vm_next) { if (!(vma->vm_flags & VM_SOFTDIRTY)) @@ -1075,7 +1078,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); if (type == CLEAR_REFS_SOFT_DIRTY) mmu_notifier_invalidate_range_end(mm, 0, -1); - flush_tlb_mm(mm); + tlb_finish_mmu(&tlb, 0, -1); up_read(&mm->mmap_sem); out_mm: mmput(mm); diff --git a/mm/ksm.c b/mm/ksm.c index 4dc92f138786..db20f8436bc3 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1038,7 +1038,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, goto out_unlock; if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || - (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte))) { + (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || + mm_tlb_flush_pending(mm)) { pte_t entry; swapped = PageSwapCache(page); -- cgit From e86b298bebf7e799e4b7232e9135799f1947552e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 10 Aug 2017 15:24:32 -0700 Subject: userfaultfd: replace ENOSPC with ESRCH in case mm has gone during copy/zeropage When the process exit races with outstanding mcopy_atomic, it would be better to return ESRCH error. When such race occurs the process and it's mm are going away and returning "no such process" to the uffd monitor seems better fit than ENOSPC. Link: http://lkml.kernel.org/r/1502111545-32305-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Andrea Arcangeli Cc: "Dr. David Alan Gilbert" Cc: Pavel Emelyanov Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 06ea26b8c996..b0d5897bc4e6 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1600,7 +1600,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, uffdio_copy.len); mmput(ctx->mm); } else { - return -ENOSPC; + return -ESRCH; } if (unlikely(put_user(ret, &user_uffdio_copy->copy))) return -EFAULT; @@ -1647,7 +1647,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, uffdio_zeropage.range.len); mmput(ctx->mm); } else { - return -ENOSPC; + return -ESRCH; } if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) return -EFAULT; -- cgit From 9183976ef1c858c289b09066fd57aae51b86653c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 25 May 2017 06:57:50 -0400 Subject: fuse: set mapping error in writepage_locked when it fails This ensures that we see errors on fsync when writeback fails. Signed-off-by: Jeff Layton Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 810ed4f99e38..ab60051be6e5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1669,6 +1669,7 @@ err_nofile: err_free: fuse_request_free(req); err: + mapping_set_error(page->mapping, error); end_page_writeback(page); return error; } -- cgit From 8a9d6e964d318533ba3d2901ce153ba317c99a89 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 5 Aug 2017 10:59:14 +0200 Subject: pnfs/blocklayout: require 64-bit sector_t The blocklayout code does not compile cleanly for a 32-bit sector_t, and also has no reliable checks for devices sizes, which makes it unsafe to use with a kernel that doesn't support large block devices. Signed-off-by: Christoph Hellwig Reported-by: Arnd Bergmann Fixes: 5c83746a0cf2 ("pnfs/blocklayout: in-kernel GETDEVICEINFO XDR parsing") Signed-off-by: Anna Schumaker --- fs/nfs/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 69d02cf8cf37..5f93cfacb3d1 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -121,6 +121,7 @@ config PNFS_FILE_LAYOUT config PNFS_BLOCK tristate depends on NFS_V4_1 && BLK_DEV_DM + depends on 64BIT || LBDAF default NFS_V4 config PNFS_FLEXFILE_LAYOUT -- cgit From c44245b3d5435f533ca8346ece65918f84c057f9 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 11 Aug 2017 09:00:06 -0700 Subject: xfs: fix inobt inode allocation search optimization When we try to allocate a free inode by searching the inobt, we try to find the inode nearest the parent inode by searching chunks both left and right of the chunk containing the parent. As an optimization, we cache the leftmost and rightmost records that we previously searched; if we do another allocation with the same parent inode, we'll pick up the search where it last left off. There's a bug in the case where we found a free inode to the left of the parent's chunk: we need to update the cached left and right records, but because we already reassigned the right record to point to the left, we end up assigning the left record to both the cached left and right records. This isn't a correctness problem strictly, but it can result in the next allocation rechecking chunks unnecessarily or allocating inodes further away from the parent than it needs to. Fix it by swapping the record pointer after we update the cached left and right records. Fixes: bd169565993b ("xfs: speed up free inode search") Signed-off-by: Omar Sandoval Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index ffd5a15d1bb6..abf5beaae907 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1246,13 +1246,13 @@ xfs_dialloc_ag_inobt( /* free inodes to the left? */ if (useleft && trec.ir_freecount) { - rec = trec; xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); cur = tcur; pag->pagl_leftrec = trec.ir_startino; pag->pagl_rightrec = rec.ir_startino; pag->pagl_pagino = pagino; + rec = trec; goto alloc_inode; } -- cgit From e28ae8e428fefe2facd72cea9f29906ecb9c861d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Aug 2017 12:45:35 -0700 Subject: iomap: fix integer truncation issues in the zeroing and dirtying helpers Fix the min_t calls in the zeroing and dirtying helpers to perform the comparisms on 64-bit types, which prevents them from incorrectly being truncated, and larger zeroing operations being stuck in a never ending loop. Special thanks to Markus Stockhausen for spotting the bug. Reported-by: Paul Menzel Tested-by: Paul Menzel Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/iomap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/iomap.c b/fs/iomap.c index 039266128b7f..59cc98ad7577 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -278,7 +278,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, unsigned long bytes; /* Bytes to write to page */ offset = (pos & (PAGE_SIZE - 1)); - bytes = min_t(unsigned long, PAGE_SIZE - offset, length); + bytes = min_t(loff_t, PAGE_SIZE - offset, length); rpage = __iomap_read_page(inode, pos); if (IS_ERR(rpage)) @@ -373,7 +373,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, unsigned offset, bytes; offset = pos & (PAGE_SIZE - 1); /* Within page */ - bytes = min_t(unsigned, PAGE_SIZE - offset, count); + bytes = min_t(loff_t, PAGE_SIZE - offset, count); if (IS_DAX(inode)) status = iomap_dax_zero(pos, offset, bytes, iomap); -- cgit From b80b32b6d5e79798b85cd4644206aaa069059390 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 14 Aug 2017 08:29:18 -0400 Subject: ext4: fix clang build regression Arnd Bergmann As Stefan pointed out, I misremembered what clang can do specifically, and it turns out that the variable-length array at the end of the structure did not work (a flexible array would have worked here but not solved the problem): fs/ext4/mballoc.c:2303:17: error: fields must have a constant size: 'variable length array in structure' extension will never be supported ext4_grpblk_t counters[blocksize_bits + 2]; This reverts part of my previous patch, using a fixed-size array again, but keeping the check for the array overflow. Fixes: 2df2c3402fc8 ("ext4: fix warning about stack corruption") Reported-by: Stefan Agner Tested-by: Chandan Rajendra Signed-off-by: Arnd Bergmann Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5a1052627a81..701085620cd8 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2300,7 +2300,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) EXT4_MAX_BLOCK_LOG_SIZE); struct sg { struct ext4_group_info info; - ext4_grpblk_t counters[blocksize_bits + 2]; + ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; } sg; group--; @@ -2309,6 +2309,9 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); + i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + sizeof(struct ext4_group_info); + grinfo = ext4_get_group_info(sb, group); /* Load the group info in memory only if not already loaded. */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { @@ -2320,7 +2323,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) buddy_loaded = 1; } - memcpy(&sg, ext4_get_group_info(sb, group), sizeof(sg)); + memcpy(&sg, ext4_get_group_info(sb, group), i); if (buddy_loaded) ext4_mb_unload_buddy(&e4b); -- cgit From 32aaf194201e98db4235b7b71ac62a22e2ac355f Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Mon, 14 Aug 2017 08:30:06 -0400 Subject: ext4: add missing xattr hash update When updating an extended attribute, if the padded value sizes are the same, a shortcut is taken to avoid the bulk of the work. This was fine until the xattr hash update was moved inside ext4_xattr_set_entry(). With that change, the hash update got missed in the shortcut case. Thanks to ZhangYi (yizhang089@gmail.com) for root causing the problem. Fixes: daf8328172df ("ext4: eliminate xattr entry e_hash recalculation for removes") Reported-by: Miklos Szeredi Signed-off-by: Tahsin Erdogan Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 82a5af9f6668..3dd970168448 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1543,7 +1543,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, /* Clear padding bytes. */ memset(val + i->value_len, 0, new_size - i->value_len); } - return 0; + goto update_hash; } /* Compute min_offs and last. */ @@ -1707,6 +1707,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, here->e_value_size = cpu_to_le32(i->value_len); } +update_hash: if (i->value) { __le32 hash = 0; @@ -1725,7 +1726,8 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, here->e_name_len, &crc32c_hash, 1); } else if (is_block) { - __le32 *value = s->base + min_offs - new_size; + __le32 *value = s->base + le16_to_cpu( + here->e_value_offs); hash = ext4_xattr_hash_entry(here->e_name, here->e_name_len, value, -- cgit From 01578e36163cdd0e4fd61d9976de15f13364e26d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 15 Aug 2017 17:40:11 +0200 Subject: x86/elf: Remove the unnecessary ADDR_NO_RANDOMIZE checks The ADDR_NO_RANDOMIZE checks in stack_maxrandom_size() and randomize_stack_top() are not required. PF_RANDOMIZE is set by load_elf_binary() only if ADDR_NO_RANDOMIZE is not set, no need to re-check after that. Signed-off-by: Oleg Nesterov Signed-off-by: Thomas Gleixner Reviewed-by: Dmitry Safonov Cc: stable@vger.kernel.org Cc: Andy Lutomirski Cc: Andrew Morton Cc: Borislav Petkov Cc: Linus Torvalds Cc: "Kirill A. Shutemov" Link: http://lkml.kernel.org/r/20170815154011.GB1076@redhat.com --- arch/x86/mm/mmap.c | 3 +-- fs/binfmt_elf.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index c94df122815a..a88cfbfbd078 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -50,8 +50,7 @@ unsigned long tasksize_64bit(void) static unsigned long stack_maxrandom_size(unsigned long task_size) { unsigned long max = 0; - if ((current->flags & PF_RANDOMIZE) && - !(current->personality & ADDR_NO_RANDOMIZE)) { + if (current->flags & PF_RANDOMIZE) { max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit()); max <<= PAGE_SHIFT; } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 879ff9c7ffd0..6466153f2bf0 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -664,8 +664,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top) { unsigned long random_variable = 0; - if ((current->flags & PF_RANDOMIZE) && - !(current->personality & ADDR_NO_RANDOMIZE)) { + if (current->flags & PF_RANDOMIZE) { random_variable = get_random_long(); random_variable &= STACK_RND_MASK; random_variable <<= PAGE_SHIFT; -- cgit From c8c03f1858331e85d397bacccd34ef409aae993c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 16 Aug 2017 17:08:07 -0700 Subject: pty: fix the cached path of the pty slave file descriptor in the master Christian Brauner reported that if you use the TIOCGPTPEER ioctl() to get a slave pty file descriptor, the resulting file descriptor doesn't look right in /proc//fd/. In particular, he wanted to use readlink() on /proc/self/fd/ to get the pathname of the slave pty (basically implementing "ptsname{_r}()"). The reason for that was that we had generated the wrong 'struct path' when we create the pty in ptmx_open(). In particular, the dentry was correct, but the vfsmount pointed to the mount of the ptmx node. That _can_ be correct - in case you use "/dev/pts/ptmx" to open the master - but usually is not. The normal case is to use /dev/ptmx, which then looks up the pts/ directory, and then the vfsmount of the ptmx node is obviously the /dev directory, not the /dev/pts/ directory. We actually did have the right vfsmount available, but in the wrong place (it gets looked up in 'devpts_acquire()' when we get a reference to the pts filesystem), and so ptmx_open() used the wrong mnt pointer. The end result of this confusion was that the pty worked fine, but when if you did TIOCGPTPEER to get the slave side of the pty, end end result would also work, but have that dodgy 'struct path'. And then when doing "d_path()" on to get the pathname, the vfsmount would not match the root of the pts directory, and d_path() would return an empty pathname thinking that the entry had escaped a bind mount into another mount. This fixes the problem by making devpts_acquire() return the vfsmount for the pts filesystem, allowing ptmx_open() to trivially just use the right mount for the pts dentry, and create the proper 'struct path'. Reported-by: Christian Brauner Cc: Al Viro Acked-by: Eric Biederman Signed-off-by: Linus Torvalds --- drivers/tty/pty.c | 7 +++++-- fs/devpts/inode.c | 4 +++- include/linux/devpts_fs.h | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 284749fb0f6b..1fc80ea87c13 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -793,6 +793,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) struct tty_struct *tty; struct path *pts_path; struct dentry *dentry; + struct vfsmount *mnt; int retval; int index; @@ -805,7 +806,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) if (retval) return retval; - fsi = devpts_acquire(filp); + fsi = devpts_acquire(filp, &mnt); if (IS_ERR(fsi)) { retval = PTR_ERR(fsi); goto out_free_file; @@ -849,7 +850,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) pts_path = kmalloc(sizeof(struct path), GFP_KERNEL); if (!pts_path) goto err_release; - pts_path->mnt = filp->f_path.mnt; + pts_path->mnt = mnt; pts_path->dentry = dentry; path_get(pts_path); tty->link->driver_data = pts_path; @@ -866,6 +867,7 @@ err_path_put: path_put(pts_path); kfree(pts_path); err_release: + mntput(mnt); tty_unlock(tty); // This will also put-ref the fsi tty_release(inode, filp); @@ -874,6 +876,7 @@ out: devpts_kill_index(fsi, index); out_put_fsi: devpts_release(fsi); + mntput(mnt); out_free_file: tty_free_file(filp); return retval; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 108df2e3602c..44dfbca9306f 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -133,7 +133,7 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) return sb->s_fs_info; } -struct pts_fs_info *devpts_acquire(struct file *filp) +struct pts_fs_info *devpts_acquire(struct file *filp, struct vfsmount **ptsmnt) { struct pts_fs_info *result; struct path path; @@ -142,6 +142,7 @@ struct pts_fs_info *devpts_acquire(struct file *filp) path = filp->f_path; path_get(&path); + *ptsmnt = NULL; /* Has the devpts filesystem already been found? */ sb = path.mnt->mnt_sb; @@ -165,6 +166,7 @@ struct pts_fs_info *devpts_acquire(struct file *filp) * pty code needs to hold extra references in case of last /dev/tty close */ atomic_inc(&sb->s_active); + *ptsmnt = mntget(path.mnt); result = DEVPTS_SB(sb); out: diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index 277ab9af9ac2..7883e901f65c 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -19,7 +19,7 @@ struct pts_fs_info; -struct pts_fs_info *devpts_acquire(struct file *); +struct pts_fs_info *devpts_acquire(struct file *, struct vfsmount **ptsmnt); void devpts_release(struct pts_fs_info *); int devpts_new_index(struct pts_fs_info *); -- cgit From 8204f8ddaafafcae074746fcf2a05a45e6827603 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 10 Aug 2017 14:20:28 -0700 Subject: xfs: clear MS_ACTIVE after finishing log recovery Way back when we established inode block-map redo log items, it was discovered that we needed to prevent the VFS from evicting inodes during log recovery because any given inode might be have bmap redo items to replay even if the inode has no link count and is ultimately deleted, and any eviction of an unlinked inode causes the inode to be truncated and freed too early. To make this possible, we set MS_ACTIVE so that inodes would not be torn down immediately upon release. Unfortunately, this also results in the quota inodes not being released at all if a later part of the mount process should fail, because we never reclaim the inodes. So, set MS_ACTIVE right before we do the last part of log recovery and clear it immediately after we finish the log recovery so that everything will be torn down properly if we abort the mount. Fixes: 17c12bcd30 ("xfs: when replaying bmap operations, don't let unlinked inodes get reaped") Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_log.c | 11 +++++++++++ fs/xfs/xfs_mount.c | 10 ---------- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0053bcf2b10a..4ebd0bafc914 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -749,9 +749,20 @@ xfs_log_mount_finish( return 0; } + /* + * During the second phase of log recovery, we need iget and + * iput to behave like they do for an active filesystem. + * xfs_fs_drop_inode needs to be able to prevent the deletion + * of inodes before we're done replaying log items on those + * inodes. Turn it off immediately after recovery finishes + * so that we don't leak the quota inodes if subsequent mount + * activities fail. + */ + mp->m_super->s_flags |= MS_ACTIVE; error = xlog_recover_finish(mp->m_log); if (!error) xfs_log_work_queue(mp); + mp->m_super->s_flags &= ~MS_ACTIVE; return error; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 40d4e8b4e193..151a82db0945 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -944,15 +944,6 @@ xfs_mountfs( } } - /* - * During the second phase of log recovery, we need iget and - * iput to behave like they do for an active filesystem. - * xfs_fs_drop_inode needs to be able to prevent the deletion - * of inodes before we're done replaying log items on those - * inodes. - */ - mp->m_super->s_flags |= MS_ACTIVE; - /* * Finish recovering the file system. This part needed to be delayed * until after the root and real-time bitmap inodes were consistently @@ -1028,7 +1019,6 @@ xfs_mountfs( out_quota: xfs_qm_unmount_quotas(mp); out_rtunmount: - mp->m_super->s_flags &= ~MS_ACTIVE; xfs_rtunmount_inodes(mp); out_rele_rip: IRELE(rip); -- cgit From 77aff8c76425c8f49b50d0b9009915066739e7d2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 10 Aug 2017 14:20:29 -0700 Subject: xfs: don't leak quotacheck dquots when cow recovery If we fail a mount on account of cow recovery errors, it's possible that a previous quotacheck left some dquots in memory. The bailout clause of xfs_mountfs forgets to purge these, and so we leak them. Fix that. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/xfs_mount.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 151a82db0945..ea7d4b4e50d0 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1024,6 +1024,8 @@ xfs_mountfs( IRELE(rip); cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp, SYNC_WAIT); + /* Clean out dquots that might be in memory after quotacheck. */ + xfs_qm_unmount(mp); out_log_dealloc: mp->m_flags |= XFS_MOUNT_UNMOUNTING; xfs_log_mount_cancel(mp); -- cgit From 143c97cc652949893c8056c679012f0aeccb80e5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 23 Aug 2017 18:16:11 -0700 Subject: Revert "pty: fix the cached path of the pty slave file descriptor in the master" This reverts commit c8c03f1858331e85d397bacccd34ef409aae993c. It turns out that while fixing the ptmx file descriptor to have the correct 'struct path' to the associated slave pty is a really good thing, it breaks some user space tools for a very annoying reason. The problem is that /dev/ptmx and its associated slave pty (/dev/pts/X) are on different mounts. That was what caused us to have the wrong path in the first place (we would mix up the vfsmount of the 'ptmx' node, with the dentry of the pty slave node), but it also means that now while we use the right vfsmount, having the pty master open also keeps the pts mount busy. And it turn sout that that makes 'pbuilder' very unhappy, as noted by Stefan Lippers-Hollmann: "This patch introduces a regression for me when using pbuilder 0.228.7[2] (a helper to build Debian packages in a chroot and to create and update its chroots) when trying to umount /dev/ptmx (inside the chroot) on Debian/ unstable (full log and pbuilder configuration file[3] attached). [...] Setting up build-essential (12.3) ... Processing triggers for libc-bin (2.24-15) ... I: unmounting dev/ptmx filesystem W: Could not unmount dev/ptmx: umount: /var/cache/pbuilder/build/1340/dev/ptmx: target is busy (In some cases useful info about processes that use the device is found by lsof(8) or fuser(1).)" apparently pbuilder tries to unmount the /dev/pts filesystem while still holding at least one master node open, which is arguably not very nice, but we don't break user space even when fixing other bugs. So this commit has to be reverted. I'll try to figure out a way to avoid caching the path to the slave pty in the master pty. The only thing that actually wants that slave pty path is the "TIOCGPTPEER" ioctl, and I think we could just recreate the path at that time. Reported-by: Stefan Lippers-Hollmann Cc: Eric W Biederman Cc: Christian Brauner Cc: Al Viro Signed-off-by: Linus Torvalds --- drivers/tty/pty.c | 7 ++----- fs/devpts/inode.c | 4 +--- include/linux/devpts_fs.h | 2 +- 3 files changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 1fc80ea87c13..284749fb0f6b 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -793,7 +793,6 @@ static int ptmx_open(struct inode *inode, struct file *filp) struct tty_struct *tty; struct path *pts_path; struct dentry *dentry; - struct vfsmount *mnt; int retval; int index; @@ -806,7 +805,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) if (retval) return retval; - fsi = devpts_acquire(filp, &mnt); + fsi = devpts_acquire(filp); if (IS_ERR(fsi)) { retval = PTR_ERR(fsi); goto out_free_file; @@ -850,7 +849,7 @@ static int ptmx_open(struct inode *inode, struct file *filp) pts_path = kmalloc(sizeof(struct path), GFP_KERNEL); if (!pts_path) goto err_release; - pts_path->mnt = mnt; + pts_path->mnt = filp->f_path.mnt; pts_path->dentry = dentry; path_get(pts_path); tty->link->driver_data = pts_path; @@ -867,7 +866,6 @@ err_path_put: path_put(pts_path); kfree(pts_path); err_release: - mntput(mnt); tty_unlock(tty); // This will also put-ref the fsi tty_release(inode, filp); @@ -876,7 +874,6 @@ out: devpts_kill_index(fsi, index); out_put_fsi: devpts_release(fsi); - mntput(mnt); out_free_file: tty_free_file(filp); return retval; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 44dfbca9306f..108df2e3602c 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -133,7 +133,7 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) return sb->s_fs_info; } -struct pts_fs_info *devpts_acquire(struct file *filp, struct vfsmount **ptsmnt) +struct pts_fs_info *devpts_acquire(struct file *filp) { struct pts_fs_info *result; struct path path; @@ -142,7 +142,6 @@ struct pts_fs_info *devpts_acquire(struct file *filp, struct vfsmount **ptsmnt) path = filp->f_path; path_get(&path); - *ptsmnt = NULL; /* Has the devpts filesystem already been found? */ sb = path.mnt->mnt_sb; @@ -166,7 +165,6 @@ struct pts_fs_info *devpts_acquire(struct file *filp, struct vfsmount **ptsmnt) * pty code needs to hold extra references in case of last /dev/tty close */ atomic_inc(&sb->s_active); - *ptsmnt = mntget(path.mnt); result = DEVPTS_SB(sb); out: diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index 7883e901f65c..277ab9af9ac2 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -19,7 +19,7 @@ struct pts_fs_info; -struct pts_fs_info *devpts_acquire(struct file *, struct vfsmount **ptsmnt); +struct pts_fs_info *devpts_acquire(struct file *); void devpts_release(struct pts_fs_info *); int devpts_new_index(struct pts_fs_info *); -- cgit From 58efbc9f5463308c43d812fd0748a4dee44c8a16 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 22 Aug 2017 23:45:59 -0700 Subject: Btrfs: fix blk_status_t/errno confusion This fixes several instances of blk_status_t and bare errno ints being mixed up, some of which are real bugs. In the normal case, 0 matches BLK_STS_OK, so we don't observe any effects of the missing conversion, but in case of errors or passes through the repair/retry paths, the errors get mixed up. The changes were identified using 'sparse', we don't have reports of the buggy behaviour. Fixes: 4e4cbee93d56 ("block: switch bios to blk_status_t") Signed-off-by: Omar Sandoval Reviewed-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/inode.c | 70 +++++++++++++++++++++++++++++------------------------- fs/btrfs/raid56.c | 34 +++++++++++++------------- fs/btrfs/volumes.c | 10 ++++---- fs/btrfs/volumes.h | 6 ++--- 5 files changed, 64 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 080e2ebb8aa0..f45b61fe9a9a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3516,7 +3516,7 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device) struct bio *bio = device->flush_bio; if (!device->flush_bio_sent) - return 0; + return BLK_STS_OK; device->flush_bio_sent = 0; wait_for_completion_io(&device->flush_wait); @@ -3563,7 +3563,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info) continue; write_dev_flush(dev); - dev->last_flush_error = 0; + dev->last_flush_error = BLK_STS_OK; } /* wait for all the barriers */ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 95c212037095..24bcd5cd9cf2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7924,11 +7924,12 @@ err: return ret; } -static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio, - int mirror_num) +static inline blk_status_t submit_dio_repair_bio(struct inode *inode, + struct bio *bio, + int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int ret; + blk_status_t ret; BUG_ON(bio_op(bio) == REQ_OP_WRITE); @@ -7980,10 +7981,10 @@ static int btrfs_check_dio_repairable(struct inode *inode, return 1; } -static int dio_read_error(struct inode *inode, struct bio *failed_bio, - struct page *page, unsigned int pgoff, - u64 start, u64 end, int failed_mirror, - bio_end_io_t *repair_endio, void *repair_arg) +static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio, + struct page *page, unsigned int pgoff, + u64 start, u64 end, int failed_mirror, + bio_end_io_t *repair_endio, void *repair_arg) { struct io_failure_record *failrec; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -7993,18 +7994,19 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, int read_mode = 0; int segs; int ret; + blk_status_t status; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); ret = btrfs_get_io_failure_record(inode, start, end, &failrec); if (ret) - return ret; + return errno_to_blk_status(ret); ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, failed_mirror); if (!ret) { free_io_failure(failure_tree, io_tree, failrec); - return -EIO; + return BLK_STS_IOERR; } segs = bio_segments(failed_bio); @@ -8022,13 +8024,13 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n", read_mode, failrec->this_mirror, failrec->in_validation); - ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror); - if (ret) { + status = submit_dio_repair_bio(inode, bio, failrec->this_mirror); + if (status) { free_io_failure(failure_tree, io_tree, failrec); bio_put(bio); } - return ret; + return status; } struct btrfs_retry_complete { @@ -8065,8 +8067,8 @@ end: bio_put(bio); } -static int __btrfs_correct_data_nocsum(struct inode *inode, - struct btrfs_io_bio *io_bio) +static blk_status_t __btrfs_correct_data_nocsum(struct inode *inode, + struct btrfs_io_bio *io_bio) { struct btrfs_fs_info *fs_info; struct bio_vec bvec; @@ -8076,8 +8078,8 @@ static int __btrfs_correct_data_nocsum(struct inode *inode, unsigned int pgoff; u32 sectorsize; int nr_sectors; - int ret; - int err = 0; + blk_status_t ret; + blk_status_t err = BLK_STS_OK; fs_info = BTRFS_I(inode)->root->fs_info; sectorsize = fs_info->sectorsize; @@ -8183,11 +8185,12 @@ static blk_status_t __btrfs_subio_endio_read(struct inode *inode, int csum_pos; bool uptodate = (err == 0); int ret; + blk_status_t status; fs_info = BTRFS_I(inode)->root->fs_info; sectorsize = fs_info->sectorsize; - err = 0; + err = BLK_STS_OK; start = io_bio->logical; done.inode = inode; io_bio->bio.bi_iter = io_bio->iter; @@ -8209,12 +8212,12 @@ try_again: done.start = start; init_completion(&done.done); - ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page, - pgoff, start, start + sectorsize - 1, - io_bio->mirror_num, - btrfs_retry_endio, &done); - if (ret) { - err = errno_to_blk_status(ret); + status = dio_read_error(inode, &io_bio->bio, bvec.bv_page, + pgoff, start, start + sectorsize - 1, + io_bio->mirror_num, btrfs_retry_endio, + &done); + if (status) { + err = status; goto next; } @@ -8250,7 +8253,7 @@ static blk_status_t btrfs_subio_endio_read(struct inode *inode, if (unlikely(err)) return __btrfs_correct_data_nocsum(inode, io_bio); else - return 0; + return BLK_STS_OK; } else { return __btrfs_subio_endio_read(inode, io_bio, err); } @@ -8423,9 +8426,9 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, return 0; } -static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, - u64 file_offset, int skip_sum, - int async_submit) +static inline blk_status_t +__btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, + int skip_sum, int async_submit) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; @@ -8488,6 +8491,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, int clone_offset = 0; int clone_len; int ret; + blk_status_t status; map_length = orig_bio->bi_iter.bi_size; submit_len = map_length; @@ -8537,9 +8541,9 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, */ atomic_inc(&dip->pending_bios); - ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, - async_submit); - if (ret) { + status = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, + async_submit); + if (status) { bio_put(bio); atomic_dec(&dip->pending_bios); goto out_err; @@ -8557,9 +8561,9 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, } while (submit_len > 0); submit: - ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, - async_submit); - if (!ret) + status = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, + async_submit); + if (!status) return 0; bio_put(bio); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 208638384cd2..2cf6ba40f7c4 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -905,7 +905,7 @@ static void raid_write_end_io(struct bio *bio) if (!atomic_dec_and_test(&rbio->stripes_pending)) return; - err = 0; + err = BLK_STS_OK; /* OK, we have read all the stripes we need to. */ max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? @@ -1324,7 +1324,7 @@ write_data: return; cleanup: - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); } /* @@ -1475,7 +1475,7 @@ static void raid_rmw_end_io(struct bio *bio) cleanup: - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); } static void async_rmw_stripe(struct btrfs_raid_bio *rbio) @@ -1579,7 +1579,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) return 0; cleanup: - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); return -EIO; finish: @@ -1795,12 +1795,12 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) void **pointers; int faila = -1, failb = -1; struct page *page; - int err; + blk_status_t err; int i; pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!pointers) { - err = -ENOMEM; + err = BLK_STS_RESOURCE; goto cleanup_io; } @@ -1856,7 +1856,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) * a bad data or Q stripe. * TODO, we should redo the xor here. */ - err = -EIO; + err = BLK_STS_IOERR; goto cleanup; } /* @@ -1882,7 +1882,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) { if (rbio->bbio->raid_map[faila] == RAID5_P_STRIPE) { - err = -EIO; + err = BLK_STS_IOERR; goto cleanup; } /* @@ -1954,13 +1954,13 @@ pstripe: } } - err = 0; + err = BLK_STS_OK; cleanup: kfree(pointers); cleanup_io: if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { - if (err == 0) + if (err == BLK_STS_OK) cache_rbio_pages(rbio); else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); @@ -1968,7 +1968,7 @@ cleanup_io: rbio_orig_end_io(rbio, err); } else if (rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { rbio_orig_end_io(rbio, err); - } else if (err == 0) { + } else if (err == BLK_STS_OK) { rbio->faila = -1; rbio->failb = -1; @@ -2005,7 +2005,7 @@ static void raid_recover_end_io(struct bio *bio) return; if (atomic_read(&rbio->error) > rbio->bbio->max_errors) - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); else __raid_recover_end_io(rbio); } @@ -2104,7 +2104,7 @@ out: cleanup: if (rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); return -EIO; } @@ -2431,7 +2431,7 @@ submit_write: nr_data = bio_list_size(&bio_list); if (!nr_data) { /* Every parity is right */ - rbio_orig_end_io(rbio, 0); + rbio_orig_end_io(rbio, BLK_STS_OK); return; } @@ -2451,7 +2451,7 @@ submit_write: return; cleanup: - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); } static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) @@ -2519,7 +2519,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) return; cleanup: - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); } /* @@ -2633,7 +2633,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) return; cleanup: - rbio_orig_end_io(rbio, -EIO); + rbio_orig_end_io(rbio, BLK_STS_IOERR); return; finish: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e8b9a269fdde..bd679bc7a1a9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6212,8 +6212,8 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) } } -int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num, int async_submit) +blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num, int async_submit) { struct btrfs_device *dev; struct bio *first_bio = bio; @@ -6233,7 +6233,7 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, &map_length, &bbio, mirror_num, 1); if (ret) { btrfs_bio_counter_dec(fs_info); - return ret; + return errno_to_blk_status(ret); } total_devs = bbio->num_stripes; @@ -6256,7 +6256,7 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, } btrfs_bio_counter_dec(fs_info); - return ret; + return errno_to_blk_status(ret); } if (map_length < length) { @@ -6283,7 +6283,7 @@ int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, dev_nr, async_submit); } btrfs_bio_counter_dec(fs_info); - return 0; + return BLK_STS_OK; } struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6f45fd60d15a..93277fc60930 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -74,7 +74,7 @@ struct btrfs_device { int missing; int can_discard; int is_tgtdev_for_dev_replace; - int last_flush_error; + blk_status_t last_flush_error; int flush_bio_sent; #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED @@ -416,8 +416,8 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 type); void btrfs_mapping_init(struct btrfs_mapping_tree *tree); void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); -int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num, int async_submit); +blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num, int async_submit); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, -- cgit From 311fc65c9fb9c966bca8e6f3ff8132ce57344ab9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 24 Aug 2017 15:13:29 -0500 Subject: pty: Repair TIOCGPTPEER The implementation of TIOCGPTPEER has two issues. When /dev/ptmx (as opposed to /dev/pts/ptmx) is opened the wrong vfsmount is passed to dentry_open. Which results in the kernel displaying the wrong pathname for the peer. The second is simply by caching the vfsmount and dentry of the peer it leaves them open, in a way they were not previously Which because of the inreased reference counts can cause unnecessary behaviour differences resulting in regressions. To fix these move the ioctl into tty_io.c at a generic level allowing the ioctl to have access to the struct file on which the ioctl is being called. This allows the path of the slave to be derived when opening the slave through TIOCGPTPEER instead of requiring the path to the slave be cached. Thus removing the need for caching the path. A new function devpts_ptmx_path is factored out of devpts_acquire and used to implement a function devpts_mntget. The new function devpts_mntget takes a filp to perform the lookup on and fsi so that it can confirm that the superblock that is found by devpts_ptmx_path is the proper superblock. v2: Lots of fixes to make the code actually work v3: Suggestions by Linus - Removed the unnecessary initialization of filp in ptm_open_peer - Simplified devpts_ptmx_path as gotos are no longer required [ This is the fix for the issue that was reverted in commit 143c97cc6529, but this time without breaking 'pbuilder' due to increased reference counts - Linus ] Fixes: 54ebbfb16034 ("tty: add TIOCGPTPEER ioctl") Reported-by: Christian Brauner Reported-and-tested-by: Stefan Lippers-Hollmann Signed-off-by: "Eric W. Biederman" Signed-off-by: Linus Torvalds --- drivers/tty/pty.c | 64 ++++++++++++++++++++-------------------------- drivers/tty/tty_io.c | 3 +++ fs/devpts/inode.c | 65 +++++++++++++++++++++++++++++++++++------------ include/linux/devpts_fs.h | 10 ++++++++ 4 files changed, 89 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index 284749fb0f6b..a6d5164c33a9 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -69,13 +69,8 @@ static void pty_close(struct tty_struct *tty, struct file *filp) #ifdef CONFIG_UNIX98_PTYS if (tty->driver == ptm_driver) { mutex_lock(&devpts_mutex); - if (tty->link->driver_data) { - struct path *path = tty->link->driver_data; - - devpts_pty_kill(path->dentry); - path_put(path); - kfree(path); - } + if (tty->link->driver_data) + devpts_pty_kill(tty->link->driver_data); mutex_unlock(&devpts_mutex); } #endif @@ -607,25 +602,24 @@ static inline void legacy_pty_init(void) { } static struct cdev ptmx_cdev; /** - * pty_open_peer - open the peer of a pty - * @tty: the peer of the pty being opened + * ptm_open_peer - open the peer of a pty + * @master: the open struct file of the ptmx device node + * @tty: the master of the pty being opened + * @flags: the flags for open * - * Open the cached dentry in tty->link, providing a safe way for userspace - * to get the slave end of a pty (where they have the master fd and cannot - * access or trust the mount namespace /dev/pts was mounted inside). + * Provide a race free way for userspace to open the slave end of a pty + * (where they have the master fd and cannot access or trust the mount + * namespace /dev/pts was mounted inside). */ -static struct file *pty_open_peer(struct tty_struct *tty, int flags) -{ - if (tty->driver->subtype != PTY_TYPE_MASTER) - return ERR_PTR(-EIO); - return dentry_open(tty->link->driver_data, flags, current_cred()); -} - -static int pty_get_peer(struct tty_struct *tty, int flags) +int ptm_open_peer(struct file *master, struct tty_struct *tty, int flags) { int fd = -1; - struct file *filp = NULL; + struct file *filp; int retval = -EINVAL; + struct path path; + + if (tty->driver != ptm_driver) + return -EIO; fd = get_unused_fd_flags(0); if (fd < 0) { @@ -633,7 +627,16 @@ static int pty_get_peer(struct tty_struct *tty, int flags) goto err; } - filp = pty_open_peer(tty, flags); + /* Compute the slave's path */ + path.mnt = devpts_mntget(master, tty->driver_data); + if (IS_ERR(path.mnt)) { + retval = PTR_ERR(path.mnt); + goto err_put; + } + path.dentry = tty->link->driver_data; + + filp = dentry_open(&path, flags, current_cred()); + mntput(path.mnt); if (IS_ERR(filp)) { retval = PTR_ERR(filp); goto err_put; @@ -662,8 +665,6 @@ static int pty_unix98_ioctl(struct tty_struct *tty, return pty_get_pktmode(tty, (int __user *)arg); case TIOCGPTN: /* Get PT Number */ return put_user(tty->index, (unsigned int __user *)arg); - case TIOCGPTPEER: /* Open the other end */ - return pty_get_peer(tty, (int) arg); case TIOCSIG: /* Send signal to other side of pty */ return pty_signal(tty, (int) arg); } @@ -791,7 +792,6 @@ static int ptmx_open(struct inode *inode, struct file *filp) { struct pts_fs_info *fsi; struct tty_struct *tty; - struct path *pts_path; struct dentry *dentry; int retval; int index; @@ -845,26 +845,16 @@ static int ptmx_open(struct inode *inode, struct file *filp) retval = PTR_ERR(dentry); goto err_release; } - /* We need to cache a fake path for TIOCGPTPEER. */ - pts_path = kmalloc(sizeof(struct path), GFP_KERNEL); - if (!pts_path) - goto err_release; - pts_path->mnt = filp->f_path.mnt; - pts_path->dentry = dentry; - path_get(pts_path); - tty->link->driver_data = pts_path; + tty->link->driver_data = dentry; retval = ptm_driver->ops->open(tty, filp); if (retval) - goto err_path_put; + goto err_release; tty_debug_hangup(tty, "opening (count=%d)\n", tty->count); tty_unlock(tty); return 0; -err_path_put: - path_put(pts_path); - kfree(pts_path); err_release: tty_unlock(tty); // This will also put-ref the fsi diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 974b13d24401..10c4038c0e8d 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -2518,6 +2518,9 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case TIOCSSERIAL: tty_warn_deprecated_flags(p); break; + case TIOCGPTPEER: + /* Special because the struct file is needed */ + return ptm_open_peer(file, tty, (int)arg); default: retval = tty_jobctrl_ioctl(tty, real_tty, file, cmd, arg); if (retval != -ENOIOCTLCMD) diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 108df2e3602c..7eae33ffa3fc 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -133,6 +133,50 @@ static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) return sb->s_fs_info; } +static int devpts_ptmx_path(struct path *path) +{ + struct super_block *sb; + int err; + + /* Has the devpts filesystem already been found? */ + if (path->mnt->mnt_sb->s_magic == DEVPTS_SUPER_MAGIC) + return 0; + + /* Is a devpts filesystem at "pts" in the same directory? */ + err = path_pts(path); + if (err) + return err; + + /* Is the path the root of a devpts filesystem? */ + sb = path->mnt->mnt_sb; + if ((sb->s_magic != DEVPTS_SUPER_MAGIC) || + (path->mnt->mnt_root != sb->s_root)) + return -ENODEV; + + return 0; +} + +struct vfsmount *devpts_mntget(struct file *filp, struct pts_fs_info *fsi) +{ + struct path path; + int err; + + path = filp->f_path; + path_get(&path); + + err = devpts_ptmx_path(&path); + dput(path.dentry); + if (err) { + mntput(path.mnt); + path.mnt = ERR_PTR(err); + } + if (DEVPTS_SB(path.mnt->mnt_sb) != fsi) { + mntput(path.mnt); + path.mnt = ERR_PTR(-ENODEV); + } + return path.mnt; +} + struct pts_fs_info *devpts_acquire(struct file *filp) { struct pts_fs_info *result; @@ -143,27 +187,16 @@ struct pts_fs_info *devpts_acquire(struct file *filp) path = filp->f_path; path_get(&path); - /* Has the devpts filesystem already been found? */ - sb = path.mnt->mnt_sb; - if (sb->s_magic != DEVPTS_SUPER_MAGIC) { - /* Is a devpts filesystem at "pts" in the same directory? */ - err = path_pts(&path); - if (err) { - result = ERR_PTR(err); - goto out; - } - - /* Is the path the root of a devpts filesystem? */ - result = ERR_PTR(-ENODEV); - sb = path.mnt->mnt_sb; - if ((sb->s_magic != DEVPTS_SUPER_MAGIC) || - (path.mnt->mnt_root != sb->s_root)) - goto out; + err = devpts_ptmx_path(&path); + if (err) { + result = ERR_PTR(err); + goto out; } /* * pty code needs to hold extra references in case of last /dev/tty close */ + sb = path.mnt->mnt_sb; atomic_inc(&sb->s_active); result = DEVPTS_SB(sb); diff --git a/include/linux/devpts_fs.h b/include/linux/devpts_fs.h index 277ab9af9ac2..100cb4343763 100644 --- a/include/linux/devpts_fs.h +++ b/include/linux/devpts_fs.h @@ -19,6 +19,7 @@ struct pts_fs_info; +struct vfsmount *devpts_mntget(struct file *, struct pts_fs_info *); struct pts_fs_info *devpts_acquire(struct file *); void devpts_release(struct pts_fs_info *); @@ -32,6 +33,15 @@ void *devpts_get_priv(struct dentry *); /* unlink */ void devpts_pty_kill(struct dentry *); +/* in pty.c */ +int ptm_open_peer(struct file *master, struct tty_struct *tty, int flags); + +#else +static inline int +ptm_open_peer(struct file *master, struct tty_struct *tty, int flags) +{ + return -EIO; +} #endif -- cgit