Diffstat (limited to 'fs/xfs/xfs_alloc.c')
-rw-r--r--  fs/xfs/xfs_alloc.c | 421
1 file changed, 285 insertions(+), 136 deletions(-)
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 275b1f4f9430..a7fbe8a99b12 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -46,11 +46,9 @@
#define XFSA_FIXUP_BNO_OK 1
#define XFSA_FIXUP_CNT_OK 2
-STATIC void
-xfs_alloc_search_busy(xfs_trans_t *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len);
+static int
+xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
+ xfs_agblock_t bno, xfs_extlen_t len);
/*
* Prototypes for per-ag allocation routines
@@ -540,9 +538,16 @@ xfs_alloc_ag_vextent(
be32_to_cpu(agf->agf_length));
xfs_alloc_log_agf(args->tp, args->agbp,
XFS_AGF_FREEBLKS);
- /* search the busylist for these blocks */
- xfs_alloc_search_busy(args->tp, args->agno,
- args->agbno, args->len);
+ /*
+ * Search the busy list for these blocks and mark the
+ * transaction as synchronous if any are found. This
+ * avoids the need to block on a synchronous log force
+ * to ensure correct ordering, as the synchronous
+ * transaction will guarantee that for us.
+ */
+ if (xfs_alloc_busy_search(args->mp, args->agno,
+ args->agbno, args->len))
+ xfs_trans_set_sync(args->tp);
}
if (!args->isfl)
xfs_trans_mod_sb(args->tp,
@@ -1662,11 +1667,13 @@ xfs_free_ag_extent(
xfs_agf_t *agf;
xfs_perag_t *pag; /* per allocation group data */
+ pag = xfs_perag_get(mp, agno);
+ pag->pagf_freeblks += len;
+ xfs_perag_put(pag);
+
agf = XFS_BUF_TO_AGF(agbp);
- pag = &mp->m_perag[agno];
be32_add_cpu(&agf->agf_freeblks, len);
xfs_trans_agblocks_delta(tp, len);
- pag->pagf_freeblks += len;
XFS_WANT_CORRUPTED_GOTO(
be32_to_cpu(agf->agf_freeblks) <=
be32_to_cpu(agf->agf_length),
@@ -1691,7 +1698,7 @@ xfs_free_ag_extent(
* when the iclog commits to disk. If a busy block is allocated,
* the iclog is pushed up to the LSN that freed the block.
*/
- xfs_alloc_mark_busy(tp, agno, bno, len);
+ xfs_alloc_busy_insert(tp, agno, bno, len);
return 0;
error0:
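
A pattern that recurs throughout this patch is the replacement of direct &mp->m_perag[agno] indexing (protected by the m_peraglock rwsem) with xfs_perag_get()/xfs_perag_put() reference-counted lookups. As a rough illustration of that pattern only, and not the kernel implementation (which looks the structure up rather than indexing a fixed array), a minimal userspace sketch might look like this; perag_get(), perag_put(), and the struct layout are hypothetical stand-ins:

#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for struct xfs_perag: one reference-counted object per AG. */
struct perag {
        atomic_int      ref;            /* active references */
        long            pagf_freeblks;  /* example per-AG state */
};

#define AG_COUNT 4
static struct perag perag_table[AG_COUNT];      /* stand-in for the mount */

/* Look up an AG and take a reference, in the style of xfs_perag_get(). */
static struct perag *perag_get(unsigned int agno)
{
        struct perag *pag = &perag_table[agno];

        atomic_fetch_add(&pag->ref, 1);
        return pag;
}

/* Drop a reference, in the style of xfs_perag_put(). */
static void perag_put(struct perag *pag)
{
        int old = atomic_fetch_sub(&pag->ref, 1);

        assert(old > 0);                /* catch an unbalanced put */
}

int main(void)
{
        /* Mirror the free-extent hunk above: get, modify, put. */
        struct perag *pag = perag_get(2);

        pag->pagf_freeblks += 16;
        perag_put(pag);
        printf("ag 2 freeblks now %ld\n", perag_table[2].pagf_freeblks);
        return 0;
}

The point of the conversion is that each access pins only the one per-AG structure it uses, instead of taking a filesystem-wide rwsem around every allocation.
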
@@ -1969,10 +1976,12 @@ xfs_alloc_get_freelist(
xfs_trans_brelse(tp, agflbp);
if (be32_to_cpu(agf->agf_flfirst) == XFS_AGFL_SIZE(mp))
agf->agf_flfirst = 0;
- pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)];
+
+ pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
be32_add_cpu(&agf->agf_flcount, -1);
xfs_trans_agflist_delta(tp, -1);
pag->pagf_flcount--;
+ xfs_perag_put(pag);
logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
if (btreeblk) {
@@ -1985,14 +1994,20 @@ xfs_alloc_get_freelist(
*bnop = bno;
/*
- * As blocks are freed, they are added to the per-ag busy list
- * and remain there until the freeing transaction is committed to
- * disk. Now that we have allocated blocks, this list must be
- * searched to see if a block is being reused. If one is, then
- * the freeing transaction must be pushed to disk NOW by forcing
- * to disk all iclogs up that transaction's LSN.
+ * As blocks are freed, they are added to the per-ag busy list and
+ * remain there until the freeing transaction is committed to disk.
+ * Now that we have allocated blocks, this list must be searched to see
+ * if a block is being reused. If one is, then the freeing transaction
+ * must be pushed to disk before this transaction.
+ *
+ * We do this by setting the current transaction to a sync transaction
+ * which guarantees that the freeing transaction is on disk before this
+ * transaction. This is done instead of a synchronous log force here so
+ * that we don't sit and wait with the AGF locked in the transaction
+ * during the log force.
*/
- xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1);
+ if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
+ xfs_trans_set_sync(tp);
return 0;
}
@@ -2078,7 +2093,8 @@ xfs_alloc_put_freelist(
be32_add_cpu(&agf->agf_fllast, 1);
if (be32_to_cpu(agf->agf_fllast) == XFS_AGFL_SIZE(mp))
agf->agf_fllast = 0;
- pag = &mp->m_perag[be32_to_cpu(agf->agf_seqno)];
+
+ pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
be32_add_cpu(&agf->agf_flcount, 1);
xfs_trans_agflist_delta(tp, 1);
pag->pagf_flcount++;
@@ -2089,6 +2105,7 @@ xfs_alloc_put_freelist(
pag->pagf_btreeblks--;
logflags |= XFS_AGF_BTREEBLKS;
}
+ xfs_perag_put(pag);
xfs_alloc_log_agf(tp, agbp, logflags);
@@ -2152,7 +2169,6 @@ xfs_read_agf(
xfs_trans_brelse(tp, *bpp);
return XFS_ERROR(EFSCORRUPTED);
}
-
XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF);
return 0;
}
@@ -2175,7 +2191,7 @@ xfs_alloc_read_agf(
ASSERT(agno != NULLAGNUMBER);
error = xfs_read_agf(mp, tp, agno,
- (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0,
+ (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
bpp);
if (error)
return error;
@@ -2184,7 +2200,7 @@ xfs_alloc_read_agf(
ASSERT(!XFS_BUF_GETERROR(*bpp));
agf = XFS_BUF_TO_AGF(*bpp);
- pag = &mp->m_perag[agno];
+ pag = xfs_perag_get(mp, agno);
if (!pag->pagf_init) {
pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
@@ -2195,8 +2211,8 @@ xfs_alloc_read_agf(
pag->pagf_levels[XFS_BTNUM_CNTi] =
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
spin_lock_init(&pag->pagb_lock);
- pag->pagb_list = kmem_zalloc(XFS_PAGB_NUM_SLOTS *
- sizeof(xfs_perag_busy_t), KM_SLEEP);
+ pag->pagb_count = 0;
+ pag->pagb_tree = RB_ROOT;
pag->pagf_init = 1;
}
#ifdef DEBUG
@@ -2211,6 +2227,7 @@ xfs_alloc_read_agf(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
}
#endif
+ xfs_perag_put(pag);
return 0;
}
@@ -2270,8 +2287,7 @@ xfs_alloc_vextent(
* These three force us into a single a.g.
*/
args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
- down_read(&mp->m_peraglock);
- args->pag = &mp->m_perag[args->agno];
+ args->pag = xfs_perag_get(mp, args->agno);
args->minleft = 0;
error = xfs_alloc_fix_freelist(args, 0);
args->minleft = minleft;
@@ -2280,14 +2296,12 @@ xfs_alloc_vextent(
goto error0;
}
if (!args->agbp) {
- up_read(&mp->m_peraglock);
trace_xfs_alloc_vextent_noagbp(args);
break;
}
args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
if ((error = xfs_alloc_ag_vextent(args)))
goto error0;
- up_read(&mp->m_peraglock);
break;
case XFS_ALLOCTYPE_START_BNO:
/*
@@ -2339,9 +2353,8 @@ xfs_alloc_vextent(
* Loop over allocation groups twice; first time with
* trylock set, second time without.
*/
- down_read(&mp->m_peraglock);
for (;;) {
- args->pag = &mp->m_perag[args->agno];
+ args->pag = xfs_perag_get(mp, args->agno);
if (no_min) args->minleft = 0;
error = xfs_alloc_fix_freelist(args, flags);
args->minleft = minleft;
@@ -2400,8 +2413,8 @@ xfs_alloc_vextent(
}
}
}
+ xfs_perag_put(args->pag);
}
- up_read(&mp->m_peraglock);
if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
if (args->agno == sagno)
mp->m_agfrotor = (mp->m_agfrotor + 1) %
@@ -2427,9 +2440,10 @@ xfs_alloc_vextent(
args->len);
#endif
}
+ xfs_perag_put(args->pag);
return 0;
error0:
- up_read(&mp->m_peraglock);
+ xfs_perag_put(args->pag);
return error;
}
@@ -2454,8 +2468,7 @@ xfs_free_extent(
args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
ASSERT(args.agno < args.mp->m_sb.sb_agcount);
args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
- down_read(&args.mp->m_peraglock);
- args.pag = &args.mp->m_perag[args.agno];
+ args.pag = xfs_perag_get(args.mp, args.agno);
if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING)))
goto error0;
#ifdef DEBUG
@@ -2465,7 +2478,7 @@ xfs_free_extent(
#endif
error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
error0:
- up_read(&args.mp->m_peraglock);
+ xfs_perag_put(args.pag);
return error;
}
@@ -2477,127 +2490,263 @@ error0:
* list is reused, the transaction that freed it must be forced to disk
* before continuing to use the block.
*
- * xfs_alloc_mark_busy - add to the per-ag busy list
- * xfs_alloc_clear_busy - remove an item from the per-ag busy list
+ * xfs_alloc_busy_insert - add to the per-ag busy list
+ * xfs_alloc_busy_clear - remove an item from the per-ag busy list
+ * xfs_alloc_busy_search - search for a busy extent
+ */
+
+/*
+ * Insert a new extent into the busy tree.
+ *
+ * The busy extent tree is indexed by the start block of the busy extent.
+ * There can be multiple overlapping ranges in the busy extent tree, but only
+ * ever one entry at a given start block. The reason for this is that
+ * multi-block extents can be freed, then smaller chunks of that extent
+ * allocated and freed again before the first transaction commit is on disk.
+ * If the exact same start block is freed a second time, we have to wait for
+ * that busy extent to pass out of the tree before the new extent is inserted.
+ * There are two main cases we have to handle here.
+ *
+ * The first case is a transaction that triggers a "free - allocate - free"
+ * cycle. This can occur during btree manipulations as a btree block is freed
+ * to the freelist, then allocated from the free list, then freed again. In
+ * this case, the second extent free is what triggers the duplicate and as
+ * such the transaction IDs should match. Because the extent was allocated in
+ * this transaction, the transaction must be marked as synchronous. This is
+ * true for all cases where the free/alloc/free occurs in the one transaction,
+ * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
+ * This serves to catch violations of the second case quite effectively.
+ *
+ * The second case is where the free/alloc/free occur in different
+ * transactions. In this case, the thread freeing the extent the second time
+ * can't mark the extent busy immediately because it is already tracked in a
+ * transaction that may be committing. When the log commit for the existing
+ * busy extent completes, the busy extent will be removed from the tree. If we
+ * allow the second busy insert to continue using that busy extent structure,
+ * it can be freed before this transaction is safely in the log. Hence our
+ * only option in this case is to force the log to remove the existing busy
+ * extent from the list before we insert the new one with the current
+ * transaction ID.
+ *
+ * The problem we are trying to avoid with a free-alloc-free cycle spanning
+ * separate transactions is most easily described with a timeline:
+ *
+ *    Thread 1         Thread 2         Thread 3         xfslogd
+ *    xact alloc
+ *    free X
+ *    mark busy
+ *    commit xact
+ *    free xact
+ *                     xact alloc
+ *                     alloc X
+ *                     busy search
+ *                     mark xact sync
+ *                     commit xact
+ *                     free xact
+ *                                      force log
+ *                                      checkpoint starts
+ *                                      ....
+ *                                      xact alloc
+ *                                      free X
+ *                                      mark busy
+ *                                      finds match
+ *                                      *** KABOOM! ***
+ *                                      ....
+ *                                                       log IO completes
+ *                                                       unbusy X
+ *                                                       checkpoint completes
+ *
+ * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
+ * the checkpoint completes, and the busy extent it matched will have been
+ * removed from the tree when it is woken. Hence it can then continue safely.
+ *
+ * However, to ensure this matching process is robust, we need to use the
+ * transaction ID for identifying the transaction, as delayed logging results
+ * in the busy extent and transaction lifecycles being different. That is, the
+ * busy extent is active for a lot longer than the transaction. Hence the
+ * transaction structure can be freed and reallocated, then used to mark the
+ * same extent busy again in the new transaction. In this case the new
+ * transaction will have a different tid but can have the same address, and
+ * hence we need to check against the tid.
+ *
+ * Future: for delayed logging, we could avoid the log force if the extent was
+ * first freed in the current checkpoint sequence. This, however, requires the
+ * ability to pin the current checkpoint in memory until this transaction
+ * commits to ensure that both the original free and the current one combine
+ * logically into the one checkpoint. If the checkpoint sequences are
+ * different, however, we still need to wait on a log force.
*/
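
To make the two cases above concrete: at insert time the logic boils down to a three-way decision based on whether the new extent overlaps an existing busy extent and whether the transaction IDs match. The following standalone sketch (hypothetical names, plain C, not the kernel code) captures just that decision:

#include <stdio.h>

enum busy_action {
        BUSY_INSERT,    /* no conflict: link the new extent into the tree */
        BUSY_MERGE,     /* same transaction: combine the two busy ranges */
        BUSY_FORCE_LOG, /* different transaction: force the log, then retry */
};

/*
 * The decision xfs_alloc_busy_insert() makes for a new busy extent,
 * reduced to its two inputs: does the new extent overlap an existing
 * busy extent, and was that extent marked busy by the same
 * transaction (compared by tid)?
 */
static enum busy_action classify(int overlap, unsigned int old_tid,
                                 unsigned int new_tid)
{
        if (!overlap)
                return BUSY_INSERT;
        return old_tid == new_tid ? BUSY_MERGE : BUSY_FORCE_LOG;
}

int main(void)
{
        printf("%d\n", classify(0, 1, 2));      /* 0: BUSY_INSERT */
        printf("%d\n", classify(1, 7, 7));      /* 1: BUSY_MERGE */
        printf("%d\n", classify(1, 7, 9));      /* 2: BUSY_FORCE_LOG */
        return 0;
}
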
void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len)
+xfs_alloc_busy_insert(
+ struct xfs_trans *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len)
{
- xfs_mount_t *mp;
- xfs_perag_busy_t *bsy;
- int n;
+ struct xfs_busy_extent *new;
+ struct xfs_busy_extent *busyp;
+ struct xfs_perag *pag;
+ struct rb_node **rbp;
+ struct rb_node *parent;
+ int match;
- mp = tp->t_mountp;
- spin_lock(&mp->m_perag[agno].pagb_lock);
-
- /* search pagb_list for an open slot */
- for (bsy = mp->m_perag[agno].pagb_list, n = 0;
- n < XFS_PAGB_NUM_SLOTS;
- bsy++, n++) {
- if (bsy->busy_tp == NULL) {
- break;
- }
- }
- trace_xfs_alloc_busy(mp, agno, bno, len, n);
-
- if (n < XFS_PAGB_NUM_SLOTS) {
- bsy = &mp->m_perag[agno].pagb_list[n];
- mp->m_perag[agno].pagb_count++;
- bsy->busy_start = bno;
- bsy->busy_length = len;
- bsy->busy_tp = tp;
- xfs_trans_add_busy(tp, agno, n);
- } else {
+ new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
+ if (!new) {
/*
- * The busy list is full! Since it is now not possible to
- * track the free block, make this a synchronous transaction
- * to insure that the block is not reused before this
- * transaction commits.
+ * No memory! Since it is now not possible to track the free
+ * block, make this a synchronous transaction to ensure that
+ * the block is not reused before this transaction commits.
*/
+ trace_xfs_alloc_busy(tp, agno, bno, len, 1);
xfs_trans_set_sync(tp);
+ return;
}
- spin_unlock(&mp->m_perag[agno].pagb_lock);
-}
-
-void
-xfs_alloc_clear_busy(xfs_trans_t *tp,
- xfs_agnumber_t agno,
- int idx)
-{
- xfs_mount_t *mp;
- xfs_perag_busy_t *list;
-
- mp = tp->t_mountp;
-
- spin_lock(&mp->m_perag[agno].pagb_lock);
- list = mp->m_perag[agno].pagb_list;
-
- ASSERT(idx < XFS_PAGB_NUM_SLOTS);
-
- trace_xfs_alloc_unbusy(mp, agno, idx, list[idx].busy_tp == tp);
-
- if (list[idx].busy_tp == tp) {
- list[idx].busy_tp = NULL;
- mp->m_perag[agno].pagb_count--;
+ new->agno = agno;
+ new->bno = bno;
+ new->length = len;
+ new->tid = xfs_log_get_trans_ident(tp);
+
+ INIT_LIST_HEAD(&new->list);
+
+ /* trace before insert to be able to see failed inserts */
+ trace_xfs_alloc_busy(tp, agno, bno, len, 0);
+
+ pag = xfs_perag_get(tp->t_mountp, new->agno);
+restart:
+ spin_lock(&pag->pagb_lock);
+ rbp = &pag->pagb_tree.rb_node;
+ parent = NULL;
+ busyp = NULL;
+ match = 0;
+ while (*rbp && match >= 0) {
+ parent = *rbp;
+ busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
+
+ if (new->bno < busyp->bno) {
+ /* may overlap, but exact start block is lower */
+ rbp = &(*rbp)->rb_left;
+ if (new->bno + new->length > busyp->bno)
+ match = busyp->tid == new->tid ? 1 : -1;
+ } else if (new->bno > busyp->bno) {
+ /* may overlap, but exact start block is higher */
+ rbp = &(*rbp)->rb_right;
+ if (bno < busyp->bno + busyp->length)
+ match = busyp->tid == new->tid ? 1 : -1;
+ } else {
+ match = busyp->tid == new->tid ? 1 : -1;
+ break;
+ }
+ }
+ if (match < 0) {
+ /* overlap marked busy in different transaction */
+ spin_unlock(&pag->pagb_lock);
+ xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
+ goto restart;
}
+ if (match > 0) {
+ /*
+ * overlap marked busy in same transaction. Update if exact
+ * start block match, otherwise combine the busy extents into
+ * a single range.
+ */
+ if (busyp->bno == new->bno) {
+ busyp->length = max(busyp->length, new->length);
+ spin_unlock(&pag->pagb_lock);
+ ASSERT(tp->t_flags & XFS_TRANS_SYNC);
+ xfs_perag_put(pag);
+ kmem_free(new);
+ return;
+ }
+ rb_erase(&busyp->rb_node, &pag->pagb_tree);
+ new->length = max(busyp->bno + busyp->length,
+ new->bno + new->length) -
+ min(busyp->bno, new->bno);
+ new->bno = min(busyp->bno, new->bno);
+ } else
+ busyp = NULL;
- spin_unlock(&mp->m_perag[agno].pagb_lock);
-}
+ rb_link_node(&new->rb_node, parent, rbp);
+ rb_insert_color(&new->rb_node, &pag->pagb_tree);
+ list_add(&new->list, &tp->t_busy);
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
+ kmem_free(busyp);
+}
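
The max/min arithmetic used above to combine two overlapping busy extents into a single covering range is compact but easy to misread; it has to handle a partial overlap hanging off either end. Here is a minimal standalone sketch of the same computation with a worked example; combine() and the typedefs are stand-ins, not kernel code:

#include <stdio.h>

typedef unsigned int agblock_t;
typedef unsigned int extlen_t;

/*
 * Combine two overlapping extents into the single range covering both,
 * mirroring the max/min logic in xfs_alloc_busy_insert(): the new start
 * is the lower of the two starts, the new end the higher of the two ends.
 */
static void combine(agblock_t b1, extlen_t l1, agblock_t b2, extlen_t l2,
                    agblock_t *bno, extlen_t *len)
{
        agblock_t start = b1 < b2 ? b1 : b2;
        agblock_t end1 = b1 + l1;
        agblock_t end2 = b2 + l2;
        agblock_t end = end1 > end2 ? end1 : end2;

        *bno = start;
        *len = end - start;
}

int main(void)
{
        agblock_t bno;
        extlen_t len;

        /* Busy extent [100, 108) overlaps new extent [104, 112). */
        combine(100, 8, 104, 8, &bno, &len);
        printf("combined: bno=%u len=%u\n", bno, len);  /* 100, 12 */
        return 0;
}
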
/*
- * If we find the extent in the busy list, force the log out to get the
- * extent out of the busy list so the caller can use it straight away.
+ * Search for a busy extent within the range of the extent we are about to
+ * allocate. xfs_alloc_busy_search() takes the busy extent tree lock itself,
+ * so the caller must not already hold it. This function returns 0 for no
+ * overlapping busy extent, -1 for an overlapping but not exact busy extent,
+ * and 1 for an exact match. This is done so that a non-zero return indicates
+ * an overlap that will require a synchronous transaction, but it can still
+ * be used to distinguish between a partial and an exact match.
*/
-STATIC void
-xfs_alloc_search_busy(xfs_trans_t *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len)
+static int
+xfs_alloc_busy_search(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len)
{
- xfs_mount_t *mp;
- xfs_perag_busy_t *bsy;
- xfs_agblock_t uend, bend;
- xfs_lsn_t lsn = 0;
- int cnt;
-
- mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ struct rb_node *rbp;
+ struct xfs_busy_extent *busyp;
+ int match = 0;
+
+ pag = xfs_perag_get(mp, agno);
+ spin_lock(&pag->pagb_lock);
+
+ rbp = pag->pagb_tree.rb_node;
+
+ /* find closest start bno overlap */
+ while (rbp) {
+ busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
+ if (bno < busyp->bno) {
+ /* may overlap, but exact start block is lower */
+ if (bno + len > busyp->bno)
+ match = -1;
+ rbp = rbp->rb_left;
+ } else if (bno > busyp->bno) {
+ /* may overlap, but exact start block is higher */
+ if (bno < busyp->bno + busyp->length)
+ match = -1;
+ rbp = rbp->rb_right;
+ } else {
+ /* bno matches busyp, length determines exact match */
+ match = (busyp->length == len) ? 1 : -1;
+ break;
+ }
+ }
+ spin_unlock(&pag->pagb_lock);
+ trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
+ xfs_perag_put(pag);
+ return match;
+}
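
The descent logic in xfs_alloc_busy_search() can be exercised in isolation. The sketch below mirrors the same 0/-1/1 classification over a hand-built binary tree standing in for the rbtree; busy_search() and struct busy are hypothetical stand-ins, and the lock-free walk is only valid here because nothing modifies the tree (the kernel code holds pag->pagb_lock for the same walk):

#include <stdio.h>

/* Minimal binary tree node standing in for the rbtree of busy extents. */
struct busy {
        unsigned int bno;       /* start block, the tree index */
        unsigned int length;
        struct busy *left, *right;
};

/*
 * Classify an allocation candidate against the busy tree, mirroring
 * xfs_alloc_busy_search(): 0 = no overlap, -1 = partial overlap,
 * 1 = exact match.
 */
static int busy_search(struct busy *node, unsigned int bno, unsigned int len)
{
        int match = 0;

        while (node) {
                if (bno < node->bno) {
                        if (bno + len > node->bno)
                                match = -1;     /* overlaps from below */
                        node = node->left;
                } else if (bno > node->bno) {
                        if (bno < node->bno + node->length)
                                match = -1;     /* overlaps from above */
                        node = node->right;
                } else {
                        /* same start block: length decides exactness */
                        return node->length == len ? 1 : -1;
                }
        }
        return match;
}

int main(void)
{
        struct busy right = { 200, 16, NULL, NULL };
        struct busy root = { 100, 8, NULL, &right };

        printf("%d\n", busy_search(&root, 100, 8));     /*  1: exact   */
        printf("%d\n", busy_search(&root, 104, 8));     /* -1: partial */
        printf("%d\n", busy_search(&root, 120, 8));     /*  0: free    */
        return 0;
}
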
- spin_lock(&mp->m_perag[agno].pagb_lock);
+void
+xfs_alloc_busy_clear(
+ struct xfs_mount *mp,
+ struct xfs_busy_extent *busyp)
+{
+ struct xfs_perag *pag;
- uend = bno + len - 1;
+ trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
+ busyp->length);
- /*
- * search pagb_list for this slot, skipping open slots. We have to
- * search the entire array as there may be multiple overlaps and
- * we have to get the most recent LSN for the log force to push out
- * all the transactions that span the range.
- */
- for (cnt = 0; cnt < mp->m_perag[agno].pagb_count; cnt++) {
- bsy = &mp->m_perag[agno].pagb_list[cnt];
- if (!bsy->busy_tp)
- continue;
+ ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
+ busyp->length) == 1);
- bend = bsy->busy_start + bsy->busy_length - 1;
- if (bno > bend || uend < bsy->busy_start)
- continue;
+ list_del_init(&busyp->list);
- /* (start1,length1) within (start2, length2) */
- if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0)
- lsn = bsy->busy_tp->t_commit_lsn;
- }
- spin_unlock(&mp->m_perag[agno].pagb_lock);
- trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
+ pag = xfs_perag_get(mp, busyp->agno);
+ spin_lock(&pag->pagb_lock);
+ rb_erase(&busyp->rb_node, &pag->pagb_tree);
+ spin_unlock(&pag->pagb_lock);
+ xfs_perag_put(pag);
- /*
- * If a block was found, force the log through the LSN of the
- * transaction that freed the block
- */
- if (lsn)
- xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
+ kmem_free(busyp);
}