From b1438f477934f5a4d5a44df26f3079a7575d5946 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 18 May 2016 13:53:42 +1000 Subject: xfs: xfs_iflush_cluster fails to abort on error When a failure due to an inode buffer occurs, the error handling fails to abort the inode writeback correctly. This can result in the inode being reclaimed whilst still in the AIL, leading to use-after-free situations as well as filesystems that cannot be unmounted as the inode log items left in the AIL never get removed. Fix this by ensuring fatal errors from xfs_imap_to_bp() result in the inode flush being aborted correctly. cc: # 3.10.x- Reported-by: Shyam Kaushik Diagnosed-by: Shyam Kaushik Tested-by: Shyam Kaushik Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 96f606deee31..696936cad0fa 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3327,7 +3327,7 @@ xfs_iflush( struct xfs_buf **bpp) { struct xfs_mount *mp = ip->i_mount; - struct xfs_buf *bp; + struct xfs_buf *bp = NULL; struct xfs_dinode *dip; int error; @@ -3369,14 +3369,22 @@ xfs_iflush( } /* - * Get the buffer containing the on-disk inode. + * Get the buffer containing the on-disk inode. We are doing a try-lock + * operation here, so we may get an EAGAIN error. In that case, we + * simply want to return with the inode still dirty. + * + * If we get any other error, we effectively have a corruption situation + * and we cannot flush the inode, so we treat it the same as failing + * xfs_iflush_int(). */ error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK, 0); - if (error || !bp) { + if (error == -EAGAIN) { xfs_ifunlock(ip); return error; } + if (error) + goto corrupt_out; /* * First flush out the inode that xfs_iflush was called with. @@ -3404,7 +3412,8 @@ xfs_iflush( return 0; corrupt_out: - xfs_buf_relse(bp); + if (bp) + xfs_buf_relse(bp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); cluster_corrupt_out: error = -EFSCORRUPTED; -- cgit From 51b07f30a71c27405259a0248206ed4e22adbee2 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 18 May 2016 13:54:22 +1000 Subject: xfs: fix inode validity check in xfs_iflush_cluster Some careless idiot(*) wrote crap code in commit 1a3e8f3 ("xfs: convert inode cache lookups to use RCU locking") back in late 2010, and so xfs_iflush_cluster checks the wrong inode for whether it is still valid under RCU protection. Fix it to lock and check the correct inode. (*) Careless-idiot: Dave Chinner cc: # 3.10.x- Discovered-by: Brain Foster Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 696936cad0fa..a955b0212453 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3205,13 +3205,13 @@ xfs_iflush_cluster( * We need to check under the i_flags_lock for a valid inode * here. Skip it if it is not valid or the wrong inode. */ - spin_lock(&ip->i_flags_lock); - if (!ip->i_ino || + spin_lock(&iq->i_flags_lock); + if (!iq->i_ino || (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { - spin_unlock(&ip->i_flags_lock); + spin_unlock(&iq->i_flags_lock); continue; } - spin_unlock(&ip->i_flags_lock); + spin_unlock(&iq->i_flags_lock); /* * Do an un-protected check to see if the inode is dirty and -- cgit From 7d3aa7fe970791f1a674b14572a411accf2f4d4e Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 18 May 2016 13:54:23 +1000 Subject: xfs: skip stale inodes in xfs_iflush_cluster We don't write back stale inodes so we should skip them in xfs_iflush_cluster, too. cc: # 3.10.x- Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index a955b0212453..3cbc9031731b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3207,6 +3207,7 @@ xfs_iflush_cluster( */ spin_lock(&iq->i_flags_lock); if (!iq->i_ino || + __xfs_iflags_test(iq, XFS_ISTALE) || (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { spin_unlock(&iq->i_flags_lock); continue; -- cgit From 8a17d7ddedb4d9031f046ae0e97c40b46aa69db5 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 18 May 2016 14:09:12 +1000 Subject: xfs: mark reclaimed inodes invalid earlier The last thing we do before using call_rcu() on an xfs_inode to be freed is mark it as invalid. This means there is a window between when we know for certain that the inode is going to be freed and when we do actually mark it as "freed". This is important in the context of RCU lookups - we can look up the inode, find that it is valid, and then use it as such not realising that it is in the final stages of being freed. As such, mark the inode as being invalid the moment we know it is going to be reclaimed. This can be done while we still hold the XFS_ILOCK_EXCL and the flush lock in xfs_inode_reclaim, meaning that it occurs well before we remove it from the radix tree, and that the i_flags_lock, the XFS_ILOCK and the inode flush lock all act as synchronisation points for detecting that an inode is about to go away. For defensive purposes, this allows us to add a further check to xfs_iflush_cluster to ensure we skip inodes that are being freed after we grab the XFS_ILOCK_SHARED and the flush lock - we know that if the inode number if valid while we have these locks held we know that it has not progressed through reclaim to the point where it is clean and is about to be freed. [bfoster: fixed __xfs_inode_clear_reclaim() using ip->i_ino after it had already been zeroed.] Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_icache.c | 46 ++++++++++++++++++++++++++++++++++------------ fs/xfs/xfs_inode.c | 13 +++++++++++++ 2 files changed, 47 insertions(+), 12 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0c94cde41016..57fcd5917a66 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -114,6 +114,18 @@ xfs_inode_free_callback( kmem_zone_free(xfs_inode_zone, ip); } +static void +__xfs_inode_free( + struct xfs_inode *ip) +{ + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!xfs_isiflocked(ip)); + XFS_STATS_DEC(ip->i_mount, vn_active); + + call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); +} + void xfs_inode_free( struct xfs_inode *ip) @@ -129,12 +141,7 @@ xfs_inode_free( ip->i_ino = 0; spin_unlock(&ip->i_flags_lock); - /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!xfs_isiflocked(ip)); - XFS_STATS_DEC(ip->i_mount, vn_active); - - call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); + __xfs_inode_free(ip); } /* @@ -772,8 +779,7 @@ __xfs_inode_set_reclaim_tag( if (!pag->pag_ici_reclaimable) { /* propagate the reclaim tag up into the perag radix tree */ spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_set(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + radix_tree_tag_set(&ip->i_mount->m_perag_tree, pag->pag_agno, XFS_ICI_RECLAIM_TAG); spin_unlock(&ip->i_mount->m_perag_lock); @@ -817,8 +823,7 @@ __xfs_inode_clear_reclaim( if (!pag->pag_ici_reclaimable) { /* clear the reclaim tag from the perag radix tree */ spin_lock(&ip->i_mount->m_perag_lock); - radix_tree_tag_clear(&ip->i_mount->m_perag_tree, - XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, pag->pag_agno, XFS_ICI_RECLAIM_TAG); spin_unlock(&ip->i_mount->m_perag_lock); trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, @@ -929,6 +934,7 @@ xfs_reclaim_inode( int sync_mode) { struct xfs_buf *bp = NULL; + xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ int error; restart: @@ -993,6 +999,22 @@ restart: xfs_iflock(ip); reclaim: + /* + * Because we use RCU freeing we need to ensure the inode always appears + * to be reclaimed with an invalid inode number when in the free state. + * We do this as early as possible under the ILOCK and flush lock so + * that xfs_iflush_cluster() can be guaranteed to detect races with us + * here. By doing this, we guarantee that once xfs_iflush_cluster has + * locked both the XFS_ILOCK and the flush lock that it will see either + * a valid, flushable inode that will serialise correctly against the + * locks below, or it will see a clean (and invalid) inode that it can + * skip. + */ + spin_lock(&ip->i_flags_lock); + ip->i_flags = XFS_IRECLAIM; + ip->i_ino = 0; + spin_unlock(&ip->i_flags_lock); + xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1006,7 +1028,7 @@ reclaim: */ spin_lock(&pag->pag_ici_lock); if (!radix_tree_delete(&pag->pag_ici_root, - XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) + XFS_INO_TO_AGINO(ip->i_mount, ino))) ASSERT(0); __xfs_inode_clear_reclaim(pag, ip); spin_unlock(&pag->pag_ici_lock); @@ -1023,7 +1045,7 @@ reclaim: xfs_qm_dqdetach(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_inode_free(ip); + __xfs_inode_free(ip); return error; out_ifunlock: diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 3cbc9031731b..e3b27982b3b2 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3239,6 +3239,19 @@ xfs_iflush_cluster( continue; } + + /* + * Check the inode number again, just to be certain we are not + * racing with freeing in xfs_reclaim_inode(). See the comments + * in that function for more information as to why the initial + * check is not sufficient. + */ + if (!iq->i_ino) { + xfs_ifunlock(iq); + xfs_iunlock(iq, XFS_ILOCK_SHARED); + continue; + } + /* * arriving here means that this inode can be flushed. First * re-check that it's dirty before flushing. -- cgit From 5a90e53e8124d3ebe4b2a6309fa3c3225c23a62a Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 18 May 2016 14:09:13 +1000 Subject: xfs: xfs_iflush_cluster has range issues xfs_iflush_cluster() does a gang lookup on the radix tree, meaning it can find inodes beyond the current cluster if there is sparse cache population. gang lookups return results in ascending index order, so stop trying to cluster inodes once the first inode outside the cluster mask is detected. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index e3b27982b3b2..6c746d79925e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3207,11 +3207,20 @@ xfs_iflush_cluster( */ spin_lock(&iq->i_flags_lock); if (!iq->i_ino || - __xfs_iflags_test(iq, XFS_ISTALE) || - (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { + __xfs_iflags_test(iq, XFS_ISTALE)) { spin_unlock(&iq->i_flags_lock); continue; } + + /* + * Once we fall off the end of the cluster, no point checking + * any more inodes in the list because they will also all be + * outside the cluster. + */ + if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { + spin_unlock(&iq->i_flags_lock); + break; + } spin_unlock(&iq->i_flags_lock); /* -- cgit From 194293631d009254348f43710a7673bbb84a4172 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 18 May 2016 14:09:46 +1000 Subject: xfs: rename variables in xfs_iflush_cluster for clarity The cluster inode variable uses unconventional naming - iq - which makes it hard to distinguish it between the inode passed into the function - ip - and that is a vector for mistakes to be made. Rename all the cluster inode variables to use a more conventional prefixes to reduce potential future confusion (cilist, cilist_size, cip). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Dave Chinner --- fs/xfs/xfs_inode.c | 74 +++++++++++++++++++++++++++--------------------------- 1 file changed, 37 insertions(+), 37 deletions(-) (limited to 'fs/xfs/xfs_inode.c') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 6c746d79925e..54cc7b05708f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3162,16 +3162,16 @@ out_trans_cancel: STATIC int xfs_iflush_cluster( - xfs_inode_t *ip, - xfs_buf_t *bp) + struct xfs_inode *ip, + struct xfs_buf *bp) { - xfs_mount_t *mp = ip->i_mount; + struct xfs_mount *mp = ip->i_mount; struct xfs_perag *pag; unsigned long first_index, mask; unsigned long inodes_per_cluster; - int ilist_size; - xfs_inode_t **ilist; - xfs_inode_t *iq; + int cilist_size; + struct xfs_inode **cilist; + struct xfs_inode *cip; int nr_found; int clcount = 0; int bufwasdelwri; @@ -3180,23 +3180,23 @@ xfs_iflush_cluster( pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; - ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); - ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS); - if (!ilist) + cilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); + cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS); + if (!cilist) goto out_put; mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; rcu_read_lock(); /* really need a gang lookup range call here */ - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist, first_index, inodes_per_cluster); if (nr_found == 0) goto out_free; for (i = 0; i < nr_found; i++) { - iq = ilist[i]; - if (iq == ip) + cip = cilist[i]; + if (cip == ip) continue; /* @@ -3205,10 +3205,10 @@ xfs_iflush_cluster( * We need to check under the i_flags_lock for a valid inode * here. Skip it if it is not valid or the wrong inode. */ - spin_lock(&iq->i_flags_lock); - if (!iq->i_ino || - __xfs_iflags_test(iq, XFS_ISTALE)) { - spin_unlock(&iq->i_flags_lock); + spin_lock(&cip->i_flags_lock); + if (!cip->i_ino || + __xfs_iflags_test(cip, XFS_ISTALE)) { + spin_unlock(&cip->i_flags_lock); continue; } @@ -3217,18 +3217,18 @@ xfs_iflush_cluster( * any more inodes in the list because they will also all be * outside the cluster. */ - if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) { - spin_unlock(&iq->i_flags_lock); + if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) { + spin_unlock(&cip->i_flags_lock); break; } - spin_unlock(&iq->i_flags_lock); + spin_unlock(&cip->i_flags_lock); /* * Do an un-protected check to see if the inode is dirty and * is a candidate for flushing. These checks will be repeated * later after the appropriate locks are acquired. */ - if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0) + if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0) continue; /* @@ -3236,15 +3236,15 @@ xfs_iflush_cluster( * then this inode cannot be flushed and is skipped. */ - if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) + if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED)) continue; - if (!xfs_iflock_nowait(iq)) { - xfs_iunlock(iq, XFS_ILOCK_SHARED); + if (!xfs_iflock_nowait(cip)) { + xfs_iunlock(cip, XFS_ILOCK_SHARED); continue; } - if (xfs_ipincount(iq)) { - xfs_ifunlock(iq); - xfs_iunlock(iq, XFS_ILOCK_SHARED); + if (xfs_ipincount(cip)) { + xfs_ifunlock(cip); + xfs_iunlock(cip, XFS_ILOCK_SHARED); continue; } @@ -3255,9 +3255,9 @@ xfs_iflush_cluster( * in that function for more information as to why the initial * check is not sufficient. */ - if (!iq->i_ino) { - xfs_ifunlock(iq); - xfs_iunlock(iq, XFS_ILOCK_SHARED); + if (!cip->i_ino) { + xfs_ifunlock(cip); + xfs_iunlock(cip, XFS_ILOCK_SHARED); continue; } @@ -3265,18 +3265,18 @@ xfs_iflush_cluster( * arriving here means that this inode can be flushed. First * re-check that it's dirty before flushing. */ - if (!xfs_inode_clean(iq)) { + if (!xfs_inode_clean(cip)) { int error; - error = xfs_iflush_int(iq, bp); + error = xfs_iflush_int(cip, bp); if (error) { - xfs_iunlock(iq, XFS_ILOCK_SHARED); + xfs_iunlock(cip, XFS_ILOCK_SHARED); goto cluster_corrupt_out; } clcount++; } else { - xfs_ifunlock(iq); + xfs_ifunlock(cip); } - xfs_iunlock(iq, XFS_ILOCK_SHARED); + xfs_iunlock(cip, XFS_ILOCK_SHARED); } if (clcount) { @@ -3286,7 +3286,7 @@ xfs_iflush_cluster( out_free: rcu_read_unlock(); - kmem_free(ilist); + kmem_free(cilist); out_put: xfs_perag_put(pag); return 0; @@ -3329,8 +3329,8 @@ cluster_corrupt_out: /* * Unlocks the flush lock */ - xfs_iflush_abort(iq, false); - kmem_free(ilist); + xfs_iflush_abort(cip, false); + kmem_free(cilist); xfs_perag_put(pag); return -EFSCORRUPTED; } -- cgit