diff options
Diffstat (limited to 'fs/xfs/xfs_alloc.c')
-rw-r--r-- | fs/xfs/xfs_alloc.c | 421 |
1 files changed, 285 insertions, 136 deletions
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 275b1f4f9430..a7fbe8a99b12 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -46,11 +46,9 @@ #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -STATIC void -xfs_alloc_search_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len); +static int +xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bno, xfs_extlen_t len); /* * Prototypes for per-ag allocation routines @@ -540,9 +538,16 @@ xfs_alloc_ag_vextent( be32_to_cpu(agf->agf_length)); xfs_alloc_log_agf(args->tp, args->agbp, XFS_AGF_FREEBLKS); - /* search the busylist for these blocks */ - xfs_alloc_search_busy(args->tp, args->agno, - args->agbno, args->len); + /* + * Search the busylist for these blocks and mark the + * transaction as synchronous if blocks are found. This + * avoids the need to block due to a synchronous log + * force to ensure correct ordering as the synchronous + * transaction will guarantee that for us. + */ + if (xfs_alloc_busy_search(args->mp, args->agno, + args->agbno, args->len)) + xfs_trans_set_sync(args->tp); } if (!args->isfl) xfs_trans_mod_sb(args->tp, @@ -1662,11 +1667,13 @@ xfs_free_ag_extent( xfs_agf_t *agf; xfs_perag_t *pag; /* per allocation group data */ + pag = xfs_perag_get(mp, agno); + pag->pagf_freeblks += len; + xfs_perag_put(pag); + agf = XFS_BUF_TO_AGF(agbp); - pag = &mp->m_perag[agno]; be32_add_cpu(&agf->agf_freeblks, len); xfs_trans_agblocks_delta(tp, len); - pag->pagf_freeblks += len; XFS_WANT_CORRUPTED_GOTO( be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length), @@ -1691,7 +1698,7 @@ xfs_free_ag_extent( * when the iclog commits to disk. If a busy block is allocated, * the iclog is pushed up to the LSN that freed the block. */ - xfs_alloc_mark_busy(tp, agno, bno, len); + xfs_alloc_busy_insert(tp, agno, bno, len); return 0; error0: @@ -1969,10 +1976,12 @@ xfs_alloc_get_freelist( xfs_trans_brelse(tp, agflbp); if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp)) agf->agf_flfirst = 0; - pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)]; + + pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); be32_add_cpu(&agf->agf_flcount, -1); xfs_trans_agflist_delta(tp, -1); pag->pagf_flcount--; + xfs_perag_put(pag); logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; if (btreeblk) { @@ -1985,14 +1994,20 @@ xfs_alloc_get_freelist( *bnop = bno; /* - * As blocks are freed, they are added to the per-ag busy list - * and remain there until the freeing transaction is committed to - * disk. Now that we have allocated blocks, this list must be - * searched to see if a block is being reused. If one is, then - * the freeing transaction must be pushed to disk NOW by forcing - * to disk all iclogs up that transaction's LSN. + * As blocks are freed, they are added to the per-ag busy list and + * remain there until the freeing transaction is committed to disk. + * Now that we have allocated blocks, this list must be searched to see + * if a block is being reused. If one is, then the freeing transaction + * must be pushed to disk before this transaction. + * + * We do this by setting the current transaction to a sync transaction + * which guarantees that the freeing transaction is on disk before this + * transaction. This is done instead of a synchronous log force here so + * that we don't sit and wait with the AGF locked in the transaction + * during the log force. */ - xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1); + if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1)) + xfs_trans_set_sync(tp); return 0; } @@ -2078,7 +2093,8 @@ xfs_alloc_put_freelist( be32_add_cpu(&agf->agf_fllast, 1); if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp)) agf->agf_fllast = 0; - pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)]; + + pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno)); be32_add_cpu(&agf->agf_flcount, 1); xfs_trans_agflist_delta(tp, 1); pag->pagf_flcount++; @@ -2089,6 +2105,7 @@ xfs_alloc_put_freelist( pag->pagf_btreeblks--; logflags |= XFS_AGF_BTREEBLKS; } + xfs_perag_put(pag); xfs_alloc_log_agf(tp, agbp, logflags); @@ -2152,7 +2169,6 @@ xfs_read_agf( xfs_trans_brelse(tp, *bpp); return XFS_ERROR(EFSCORRUPTED); } - XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF); return 0; } @@ -2175,7 +2191,7 @@ xfs_alloc_read_agf( ASSERT(agno != NULLAGNUMBER); error = xfs_read_agf(mp, tp, agno, - (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0, + (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, bpp); if (error) return error; @@ -2184,7 +2200,7 @@ xfs_alloc_read_agf( ASSERT(!XFS_BUF_GETERROR(*bpp)); agf = XFS_BUF_TO_AGF(*bpp); - pag = &mp->m_perag[agno]; + pag = xfs_perag_get(mp, agno); if (!pag->pagf_init) { pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); @@ -2195,8 +2211,8 @@ xfs_alloc_read_agf( pag->pagf_levels[XFS_BTNUM_CNTi] = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); spin_lock_init(&pag->pagb_lock); - pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS * - sizeof(xfs_perag_busy_t), KM_SLEEP); + pag->pagb_count = 0; + pag->pagb_tree = RB_ROOT; pag->pagf_init = 1; } #ifdef DEBUG @@ -2211,6 +2227,7 @@ xfs_alloc_read_agf( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); } #endif + xfs_perag_put(pag); return 0; } @@ -2270,8 +2287,7 @@ xfs_alloc_vextent( * These three force us into a single a.g. */ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); - down_read(&mp->m_peraglock); - args->pag = &mp->m_perag[args->agno]; + args->pag = xfs_perag_get(mp, args->agno); args->minleft = 0; error = xfs_alloc_fix_freelist(args, 0); args->minleft = minleft; @@ -2280,14 +2296,12 @@ xfs_alloc_vextent( goto error0; } if (!args->agbp) { - up_read(&mp->m_peraglock); trace_xfs_alloc_vextent_noagbp(args); break; } args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); if ((error = xfs_alloc_ag_vextent(args))) goto error0; - up_read(&mp->m_peraglock); break; case XFS_ALLOCTYPE_START_BNO: /* @@ -2339,9 +2353,8 @@ xfs_alloc_vextent( * Loop over allocation groups twice; first time with * trylock set, second time without. */ - down_read(&mp->m_peraglock); for (;;) { - args->pag = &mp->m_perag[args->agno]; + args->pag = xfs_perag_get(mp, args->agno); if (no_min) args->minleft = 0; error = xfs_alloc_fix_freelist(args, flags); args->minleft = minleft; @@ -2400,8 +2413,8 @@ xfs_alloc_vextent( } } } + xfs_perag_put(args->pag); } - up_read(&mp->m_peraglock); if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) { if (args->agno == sagno) mp->m_agfrotor = (mp->m_agfrotor + 1) % @@ -2427,9 +2440,10 @@ xfs_alloc_vextent( args->len); #endif } + xfs_perag_put(args->pag); return 0; error0: - up_read(&mp->m_peraglock); + xfs_perag_put(args->pag); return error; } @@ -2454,8 +2468,7 @@ xfs_free_extent( args.agno = XFS_FSB_TO_AGNO(args.mp, bno); ASSERT(args.agno < args.mp->m_sb.sb_agcount); args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); - down_read(&args.mp->m_peraglock); - args.pag = &args.mp->m_perag[args.agno]; + args.pag = xfs_perag_get(args.mp, args.agno); if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) goto error0; #ifdef DEBUG @@ -2465,7 +2478,7 @@ xfs_free_extent( #endif error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); error0: - up_read(&args.mp->m_peraglock); + xfs_perag_put(args.pag); return error; } @@ -2477,127 +2490,263 @@ error0: * list is reused, the transaction that freed it must be forced to disk * before continuing to use the block. * - * xfs_alloc_mark_busy - add to the per-ag busy list - * xfs_alloc_clear_busy - remove an item from the per-ag busy list + * xfs_alloc_busy_insert - add to the per-ag busy list + * xfs_alloc_busy_clear - remove an item from the per-ag busy list + * xfs_alloc_busy_search - search for a busy extent + */ + +/* + * Insert a new extent into the busy tree. + * + * The busy extent tree is indexed by the start block of the busy extent. + * there can be multiple overlapping ranges in the busy extent tree but only + * ever one entry at a given start block. The reason for this is that + * multi-block extents can be freed, then smaller chunks of that extent + * allocated and freed again before the first transaction commit is on disk. + * If the exact same start block is freed a second time, we have to wait for + * that busy extent to pass out of the tree before the new extent is inserted. + * There are two main cases we have to handle here. + * + * The first case is a transaction that triggers a "free - allocate - free" + * cycle. This can occur during btree manipulations as a btree block is freed + * to the freelist, then allocated from the free list, then freed again. In + * this case, the second extxpnet free is what triggers the duplicate and as + * such the transaction IDs should match. Because the extent was allocated in + * this transaction, the transaction must be marked as synchronous. This is + * true for all cases where the free/alloc/free occurs in the one transaction, + * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case. + * This serves to catch violations of the second case quite effectively. + * + * The second case is where the free/alloc/free occur in different + * transactions. In this case, the thread freeing the extent the second time + * can't mark the extent busy immediately because it is already tracked in a + * transaction that may be committing. When the log commit for the existing + * busy extent completes, the busy extent will be removed from the tree. If we + * allow the second busy insert to continue using that busy extent structure, + * it can be freed before this transaction is safely in the log. Hence our + * only option in this case is to force the log to remove the existing busy + * extent from the list before we insert the new one with the current + * transaction ID. + * + * The problem we are trying to avoid in the free-alloc-free in separate + * transactions is most easily described with a timeline: + * + * Thread 1 Thread 2 Thread 3 xfslogd + * xact alloc + * free X + * mark busy + * commit xact + * free xact + * xact alloc + * alloc X + * busy search + * mark xact sync + * commit xact + * free xact + * force log + * checkpoint starts + * .... + * xact alloc + * free X + * mark busy + * finds match + * *** KABOOM! *** + * .... + * log IO completes + * unbusy X + * checkpoint completes + * + * By issuing a log force in thread 3 @ "KABOOM", the thread will block until + * the checkpoint completes, and the busy extent it matched will have been + * removed from the tree when it is woken. Hence it can then continue safely. + * + * However, to ensure this matching process is robust, we need to use the + * transaction ID for identifying transaction, as delayed logging results in + * the busy extent and transaction lifecycles being different. i.e. the busy + * extent is active for a lot longer than the transaction. Hence the + * transaction structure can be freed and reallocated, then mark the same + * extent busy again in the new transaction. In this case the new transaction + * will have a different tid but can have the same address, and hence we need + * to check against the tid. + * + * Future: for delayed logging, we could avoid the log force if the extent was + * first freed in the current checkpoint sequence. This, however, requires the + * ability to pin the current checkpoint in memory until this transaction + * commits to ensure that both the original free and the current one combine + * logically into the one checkpoint. If the checkpoint sequences are + * different, however, we still need to wait on a log force. */ void -xfs_alloc_mark_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) +xfs_alloc_busy_insert( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) { - xfs_mount_t *mp; - xfs_perag_busy_t *bsy; - int n; + struct xfs_busy_extent *new; + struct xfs_busy_extent *busyp; + struct xfs_perag *pag; + struct rb_node **rbp; + struct rb_node *parent; + int match; - mp = tp->t_mountp; - spin_lock(&mp->m_perag[agno].pagb_lock); - - /* search pagb_list for an open slot */ - for (bsy = mp->m_perag[agno].pagb_list, n = 0; - n < XFS_PAGB_NUM_SLOTS; - bsy++, n++) { - if (bsy->busy_tp == NULL) { - break; - } - } - trace_xfs_alloc_busy(mp, agno, bno, len, n); - - if (n < XFS_PAGB_NUM_SLOTS) { - bsy = &mp->m_perag[agno].pagb_list[n]; - mp->m_perag[agno].pagb_count++; - bsy->busy_start = bno; - bsy->busy_length = len; - bsy->busy_tp = tp; - xfs_trans_add_busy(tp, agno, n); - } else { + new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); + if (!new) { /* - * The busy list is full! Since it is now not possible to - * track the free block, make this a synchronous transaction - * to insure that the block is not reused before this - * transaction commits. + * No Memory! Since it is now not possible to track the free + * block, make this a synchronous transaction to insure that + * the block is not reused before this transaction commits. */ + trace_xfs_alloc_busy(tp, agno, bno, len, 1); xfs_trans_set_sync(tp); + return; } - spin_unlock(&mp->m_perag[agno].pagb_lock); -} - -void -xfs_alloc_clear_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - int idx) -{ - xfs_mount_t *mp; - xfs_perag_busy_t *list; - - mp = tp->t_mountp; - - spin_lock(&mp->m_perag[agno].pagb_lock); - list = mp->m_perag[agno].pagb_list; - - ASSERT(idx < XFS_PAGB_NUM_SLOTS); - - trace_xfs_alloc_unbusy(mp, agno, idx, list[idx].busy_tp == tp); - - if (list[idx].busy_tp == tp) { - list[idx].busy_tp = NULL; - mp->m_perag[agno].pagb_count--; + new->agno = agno; + new->bno = bno; + new->length = len; + new->tid = xfs_log_get_trans_ident(tp); + + INIT_LIST_HEAD(&new->list); + + /* trace before insert to be able to see failed inserts */ + trace_xfs_alloc_busy(tp, agno, bno, len, 0); + + pag = xfs_perag_get(tp->t_mountp, new->agno); +restart: + spin_lock(&pag->pagb_lock); + rbp = &pag->pagb_tree.rb_node; + parent = NULL; + busyp = NULL; + match = 0; + while (*rbp && match >= 0) { + parent = *rbp; + busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); + + if (new->bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + rbp = &(*rbp)->rb_left; + if (new->bno + new->length > busyp->bno) + match = busyp->tid == new->tid ? 1 : -1; + } else if (new->bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + rbp = &(*rbp)->rb_right; + if (bno < busyp->bno + busyp->length) + match = busyp->tid == new->tid ? 1 : -1; + } else { + match = busyp->tid == new->tid ? 1 : -1; + break; + } + } + if (match < 0) { + /* overlap marked busy in different transaction */ + spin_unlock(&pag->pagb_lock); + xfs_log_force(tp->t_mountp, XFS_LOG_SYNC); + goto restart; } + if (match > 0) { + /* + * overlap marked busy in same transaction. Update if exact + * start block match, otherwise combine the busy extents into + * a single range. + */ + if (busyp->bno == new->bno) { + busyp->length = max(busyp->length, new->length); + spin_unlock(&pag->pagb_lock); + ASSERT(tp->t_flags & XFS_TRANS_SYNC); + xfs_perag_put(pag); + kmem_free(new); + return; + } + rb_erase(&busyp->rb_node, &pag->pagb_tree); + new->length = max(busyp->bno + busyp->length, + new->bno + new->length) - + min(busyp->bno, new->bno); + new->bno = min(busyp->bno, new->bno); + } else + busyp = NULL; - spin_unlock(&mp->m_perag[agno].pagb_lock); -} + rb_link_node(&new->rb_node, parent, rbp); + rb_insert_color(&new->rb_node, &pag->pagb_tree); + list_add(&new->list, &tp->t_busy); + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + kmem_free(busyp); +} /* - * If we find the extent in the busy list, force the log out to get the - * extent out of the busy list so the caller can use it straight away. + * Search for a busy extent within the range of the extent we are about to + * allocate. You need to be holding the busy extent tree lock when calling + * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy + * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact + * match. This is done so that a non-zero return indicates an overlap that + * will require a synchronous transaction, but it can still be + * used to distinguish between a partial or exact match. */ -STATIC void -xfs_alloc_search_busy(xfs_trans_t *tp, - xfs_agnumber_t agno, - xfs_agblock_t bno, - xfs_extlen_t len) +static int +xfs_alloc_busy_search( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t bno, + xfs_extlen_t len) { - xfs_mount_t *mp; - xfs_perag_busy_t *bsy; - xfs_agblock_t uend, bend; - xfs_lsn_t lsn = 0; - int cnt; - - mp = tp->t_mountp; + struct xfs_perag *pag; + struct rb_node *rbp; + struct xfs_busy_extent *busyp; + int match = 0; + + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); + + rbp = pag->pagb_tree.rb_node; + + /* find closest start bno overlap */ + while (rbp) { + busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node); + if (bno < busyp->bno) { + /* may overlap, but exact start block is lower */ + if (bno + len > busyp->bno) + match = -1; + rbp = rbp->rb_left; + } else if (bno > busyp->bno) { + /* may overlap, but exact start block is higher */ + if (bno < busyp->bno + busyp->length) + match = -1; + rbp = rbp->rb_right; + } else { + /* bno matches busyp, length determines exact match */ + match = (busyp->length == len) ? 1 : -1; + break; + } + } + spin_unlock(&pag->pagb_lock); + trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match); + xfs_perag_put(pag); + return match; +} - spin_lock(&mp->m_perag[agno].pagb_lock); +void +xfs_alloc_busy_clear( + struct xfs_mount *mp, + struct xfs_busy_extent *busyp) +{ + struct xfs_perag *pag; - uend = bno + len - 1; + trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno, + busyp->length); - /* - * search pagb_list for this slot, skipping open slots. We have to - * search the entire array as there may be multiple overlaps and - * we have to get the most recent LSN for the log force to push out - * all the transactions that span the range. - */ - for (cnt = 0; cnt < mp->m_perag[agno].pagb_count; cnt++) { - bsy = &mp->m_perag[agno].pagb_list[cnt]; - if (!bsy->busy_tp) - continue; + ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno, + busyp->length) == 1); - bend = bsy->busy_start + bsy->busy_length - 1; - if (bno > bend || uend < bsy->busy_start) - continue; + list_del_init(&busyp->list); - /* (start1,length1) within (start2, length2) */ - if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) - lsn = bsy->busy_tp->t_commit_lsn; - } - spin_unlock(&mp->m_perag[agno].pagb_lock); - trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn); + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + rb_erase(&busyp->rb_node, &pag->pagb_tree); + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); - /* - * If a block was found, force the log through the LSN of the - * transaction that freed the block - */ - if (lsn) - xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); + kmem_free(busyp); } |