aboutsummaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/acl.c6
-rw-r--r--fs/btrfs/async-thread.c260
-rw-r--r--fs/btrfs/async-thread.h12
-rw-r--r--fs/btrfs/btrfs_inode.h9
-rw-r--r--fs/btrfs/compression.c9
-rw-r--r--fs/btrfs/ctree.c127
-rw-r--r--fs/btrfs/ctree.h129
-rw-r--r--fs/btrfs/dir-item.c47
-rw-r--r--fs/btrfs/disk-io.c262
-rw-r--r--fs/btrfs/export.c133
-rw-r--r--fs/btrfs/extent-tree.c2694
-rw-r--r--fs/btrfs/extent_io.c416
-rw-r--r--fs/btrfs/extent_io.h29
-rw-r--r--fs/btrfs/extent_map.c103
-rw-r--r--fs/btrfs/extent_map.h5
-rw-r--r--fs/btrfs/file.c76
-rw-r--r--fs/btrfs/free-space-cache.c1066
-rw-r--r--fs/btrfs/free-space-cache.h8
-rw-r--r--fs/btrfs/inode-item.c4
-rw-r--r--fs/btrfs/inode-map.c93
-rw-r--r--fs/btrfs/inode.c979
-rw-r--r--fs/btrfs/ioctl.c406
-rw-r--r--fs/btrfs/ioctl.h3
-rw-r--r--fs/btrfs/ordered-data.c127
-rw-r--r--fs/btrfs/ordered-data.h7
-rw-r--r--fs/btrfs/orphan.c20
-rw-r--r--fs/btrfs/print-tree.c6
-rw-r--r--fs/btrfs/relocation.c283
-rw-r--r--fs/btrfs/root-tree.c138
-rw-r--r--fs/btrfs/super.c8
-rw-r--r--fs/btrfs/transaction.c106
-rw-r--r--fs/btrfs/transaction.h1
-rw-r--r--fs/btrfs/tree-log.c29
-rw-r--r--fs/btrfs/volumes.c167
-rw-r--r--fs/btrfs/volumes.h3
-rw-r--r--fs/btrfs/xattr.c2
-rw-r--r--fs/btrfs/zlib.c6
37 files changed, 5126 insertions, 2653 deletions
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index f128427b995b..69b355ae7f49 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -27,7 +27,7 @@
#include "btrfs_inode.h"
#include "xattr.h"
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_POSIX_ACL
static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
{
@@ -313,7 +313,7 @@ struct xattr_handler btrfs_xattr_acl_access_handler = {
.set = btrfs_xattr_acl_access_set,
};
-#else /* CONFIG_FS_POSIX_ACL */
+#else /* CONFIG_BTRFS_POSIX_ACL */
int btrfs_acl_chmod(struct inode *inode)
{
@@ -325,4 +325,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
return 0;
}
-#endif /* CONFIG_FS_POSIX_ACL */
+#endif /* CONFIG_BTRFS_POSIX_ACL */
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 7f88628a1a72..282ca085c2fb 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -48,6 +48,9 @@ struct btrfs_worker_thread {
/* number of things on the pending list */
atomic_t num_pending;
+ /* reference counter for this struct */
+ atomic_t refs;
+
unsigned long sequence;
/* protects the pending list. */
@@ -71,7 +74,12 @@ static void check_idle_worker(struct btrfs_worker_thread *worker)
unsigned long flags;
spin_lock_irqsave(&worker->workers->lock, flags);
worker->idle = 1;
- list_move(&worker->worker_list, &worker->workers->idle_list);
+
+ /* the list may be empty if the worker is just starting */
+ if (!list_empty(&worker->worker_list)) {
+ list_move(&worker->worker_list,
+ &worker->workers->idle_list);
+ }
spin_unlock_irqrestore(&worker->workers->lock, flags);
}
}
@@ -87,23 +95,49 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
unsigned long flags;
spin_lock_irqsave(&worker->workers->lock, flags);
worker->idle = 0;
- list_move_tail(&worker->worker_list,
- &worker->workers->worker_list);
+
+ if (!list_empty(&worker->worker_list)) {
+ list_move_tail(&worker->worker_list,
+ &worker->workers->worker_list);
+ }
spin_unlock_irqrestore(&worker->workers->lock, flags);
}
}
-static noinline int run_ordered_completions(struct btrfs_workers *workers,
- struct btrfs_work *work)
+static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
{
+ struct btrfs_workers *workers = worker->workers;
unsigned long flags;
+ rmb();
+ if (!workers->atomic_start_pending)
+ return;
+
+ spin_lock_irqsave(&workers->lock, flags);
+ if (!workers->atomic_start_pending)
+ goto out;
+
+ workers->atomic_start_pending = 0;
+ if (workers->num_workers >= workers->max_workers)
+ goto out;
+
+ spin_unlock_irqrestore(&workers->lock, flags);
+ btrfs_start_workers(workers, 1);
+ return;
+
+out:
+ spin_unlock_irqrestore(&workers->lock, flags);
+}
+
+static noinline int run_ordered_completions(struct btrfs_workers *workers,
+ struct btrfs_work *work)
+{
if (!workers->ordered)
return 0;
set_bit(WORK_DONE_BIT, &work->flags);
- spin_lock_irqsave(&workers->lock, flags);
+ spin_lock(&workers->order_lock);
while (1) {
if (!list_empty(&workers->prio_order_list)) {
@@ -126,45 +160,118 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
break;
- spin_unlock_irqrestore(&workers->lock, flags);
+ spin_unlock(&workers->order_lock);
work->ordered_func(work);
/* now take the lock again and call the freeing code */
- spin_lock_irqsave(&workers->lock, flags);
+ spin_lock(&workers->order_lock);
list_del(&work->order_list);
work->ordered_free(work);
}
- spin_unlock_irqrestore(&workers->lock, flags);
+ spin_unlock(&workers->order_lock);
return 0;
}
+static void put_worker(struct btrfs_worker_thread *worker)
+{
+ if (atomic_dec_and_test(&worker->refs))
+ kfree(worker);
+}
+
+static int try_worker_shutdown(struct btrfs_worker_thread *worker)
+{
+ int freeit = 0;
+
+ spin_lock_irq(&worker->lock);
+ spin_lock(&worker->workers->lock);
+ if (worker->workers->num_workers > 1 &&
+ worker->idle &&
+ !worker->working &&
+ !list_empty(&worker->worker_list) &&
+ list_empty(&worker->prio_pending) &&
+ list_empty(&worker->pending) &&
+ atomic_read(&worker->num_pending) == 0) {
+ freeit = 1;
+ list_del_init(&worker->worker_list);
+ worker->workers->num_workers--;
+ }
+ spin_unlock(&worker->workers->lock);
+ spin_unlock_irq(&worker->lock);
+
+ if (freeit)
+ put_worker(worker);
+ return freeit;
+}
+
+static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
+ struct list_head *prio_head,
+ struct list_head *head)
+{
+ struct btrfs_work *work = NULL;
+ struct list_head *cur = NULL;
+
+ if(!list_empty(prio_head))
+ cur = prio_head->next;
+
+ smp_mb();
+ if (!list_empty(&worker->prio_pending))
+ goto refill;
+
+ if (!list_empty(head))
+ cur = head->next;
+
+ if (cur)
+ goto out;
+
+refill:
+ spin_lock_irq(&worker->lock);
+ list_splice_tail_init(&worker->prio_pending, prio_head);
+ list_splice_tail_init(&worker->pending, head);
+
+ if (!list_empty(prio_head))
+ cur = prio_head->next;
+ else if (!list_empty(head))
+ cur = head->next;
+ spin_unlock_irq(&worker->lock);
+
+ if (!cur)
+ goto out_fail;
+
+out:
+ work = list_entry(cur, struct btrfs_work, list);
+
+out_fail:
+ return work;
+}
+
/*
* main loop for servicing work items
*/
static int worker_loop(void *arg)
{
struct btrfs_worker_thread *worker = arg;
- struct list_head *cur;
+ struct list_head head;
+ struct list_head prio_head;
struct btrfs_work *work;
+
+ INIT_LIST_HEAD(&head);
+ INIT_LIST_HEAD(&prio_head);
+
do {
- spin_lock_irq(&worker->lock);
-again_locked:
+again:
while (1) {
- if (!list_empty(&worker->prio_pending))
- cur = worker->prio_pending.next;
- else if (!list_empty(&worker->pending))
- cur = worker->pending.next;
- else
+
+
+ work = get_next_work(worker, &prio_head, &head);
+ if (!work)
break;
- work = list_entry(cur, struct btrfs_work, list);
list_del(&work->list);
clear_bit(WORK_QUEUED_BIT, &work->flags);
work->worker = worker;
- spin_unlock_irq(&worker->lock);
work->func(work);
@@ -175,9 +282,13 @@ again_locked:
*/
run_ordered_completions(worker->workers, work);
- spin_lock_irq(&worker->lock);
- check_idle_worker(worker);
+ check_pending_worker_creates(worker);
+
}
+
+ spin_lock_irq(&worker->lock);
+ check_idle_worker(worker);
+
if (freezing(current)) {
worker->working = 0;
spin_unlock_irq(&worker->lock);
@@ -216,8 +327,10 @@ again_locked:
spin_lock_irq(&worker->lock);
set_current_state(TASK_INTERRUPTIBLE);
if (!list_empty(&worker->pending) ||
- !list_empty(&worker->prio_pending))
- goto again_locked;
+ !list_empty(&worker->prio_pending)) {
+ spin_unlock_irq(&worker->lock);
+ goto again;
+ }
/*
* this makes sure we get a wakeup when someone
@@ -226,8 +339,13 @@ again_locked:
worker->working = 0;
spin_unlock_irq(&worker->lock);
- if (!kthread_should_stop())
- schedule();
+ if (!kthread_should_stop()) {
+ schedule_timeout(HZ * 120);
+ if (!worker->working &&
+ try_worker_shutdown(worker)) {
+ return 0;
+ }
+ }
}
__set_current_state(TASK_RUNNING);
}
@@ -242,16 +360,30 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
{
struct list_head *cur;
struct btrfs_worker_thread *worker;
+ int can_stop;
+ spin_lock_irq(&workers->lock);
list_splice_init(&workers->idle_list, &workers->worker_list);
while (!list_empty(&workers->worker_list)) {
cur = workers->worker_list.next;
worker = list_entry(cur, struct btrfs_worker_thread,
worker_list);
- kthread_stop(worker->task);
- list_del(&worker->worker_list);
- kfree(worker);
+
+ atomic_inc(&worker->refs);
+ workers->num_workers -= 1;
+ if (!list_empty(&worker->worker_list)) {
+ list_del_init(&worker->worker_list);
+ put_worker(worker);
+ can_stop = 1;
+ } else
+ can_stop = 0;
+ spin_unlock_irq(&workers->lock);
+ if (can_stop)
+ kthread_stop(worker->task);
+ spin_lock_irq(&workers->lock);
+ put_worker(worker);
}
+ spin_unlock_irq(&workers->lock);
return 0;
}
@@ -266,10 +398,13 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
INIT_LIST_HEAD(&workers->order_list);
INIT_LIST_HEAD(&workers->prio_order_list);
spin_lock_init(&workers->lock);
+ spin_lock_init(&workers->order_lock);
workers->max_workers = max;
workers->idle_thresh = 32;
workers->name = name;
workers->ordered = 0;
+ workers->atomic_start_pending = 0;
+ workers->atomic_worker_start = 0;
}
/*
@@ -293,17 +428,18 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
INIT_LIST_HEAD(&worker->prio_pending);
INIT_LIST_HEAD(&worker->worker_list);
spin_lock_init(&worker->lock);
+
atomic_set(&worker->num_pending, 0);
+ atomic_set(&worker->refs, 1);
worker->workers = workers;
worker->task = kthread_run(worker_loop, worker,
"btrfs-%s-%d", workers->name,
workers->num_workers + i);
if (IS_ERR(worker->task)) {
- kfree(worker);
ret = PTR_ERR(worker->task);
+ kfree(worker);
goto fail;
}
-
spin_lock_irq(&workers->lock);
list_add_tail(&worker->worker_list, &workers->idle_list);
worker->idle = 1;
@@ -350,7 +486,6 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
*/
next = workers->worker_list.next;
worker = list_entry(next, struct btrfs_worker_thread, worker_list);
- atomic_inc(&worker->num_pending);
worker->sequence++;
if (worker->sequence % workers->idle_thresh == 0)
@@ -367,28 +502,18 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
{
struct btrfs_worker_thread *worker;
unsigned long flags;
+ struct list_head *fallback;
again:
spin_lock_irqsave(&workers->lock, flags);
worker = next_worker(workers);
- spin_unlock_irqrestore(&workers->lock, flags);
if (!worker) {
- spin_lock_irqsave(&workers->lock, flags);
if (workers->num_workers >= workers->max_workers) {
- struct list_head *fallback = NULL;
- /*
- * we have failed to find any workers, just
- * return the force one
- */
- if (!list_empty(&workers->worker_list))
- fallback = workers->worker_list.next;
- if (!list_empty(&workers->idle_list))
- fallback = workers->idle_list.next;
- BUG_ON(!fallback);
- worker = list_entry(fallback,
- struct btrfs_worker_thread, worker_list);
- spin_unlock_irqrestore(&workers->lock, flags);
+ goto fallback;
+ } else if (workers->atomic_worker_start) {
+ workers->atomic_start_pending = 1;
+ goto fallback;
} else {
spin_unlock_irqrestore(&workers->lock, flags);
/* we're below the limit, start another worker */
@@ -396,6 +521,28 @@ again:
goto again;
}
}
+ goto found;
+
+fallback:
+ fallback = NULL;
+ /*
+ * we have failed to find any workers, just
+ * return the first one we can find.
+ */
+ if (!list_empty(&workers->worker_list))
+ fallback = workers->worker_list.next;
+ if (!list_empty(&workers->idle_list))
+ fallback = workers->idle_list.next;
+ BUG_ON(!fallback);
+ worker = list_entry(fallback,
+ struct btrfs_worker_thread, worker_list);
+found:
+ /*
+ * this makes sure the worker doesn't exit before it is placed
+ * onto a busy/idle list
+ */
+ atomic_inc(&worker->num_pending);
+ spin_unlock_irqrestore(&workers->lock, flags);
return worker;
}
@@ -424,20 +571,20 @@ int btrfs_requeue_work(struct btrfs_work *work)
* list
*/
if (worker->idle) {
- spin_lock_irqsave(&worker->workers->lock, flags);
+ spin_lock(&worker->workers->lock);
worker->idle = 0;
list_move_tail(&worker->worker_list,
- &worker->workers->worker_list);
- spin_unlock_irqrestore(&worker->workers->lock, flags);
+ &worker->workers->worker_list);
+ spin_unlock(&worker->workers->lock);
}
if (!worker->working) {
wake = 1;
worker->working = 1;
}
- spin_unlock_irqrestore(&worker->lock, flags);
if (wake)
wake_up_process(worker->task);
+ spin_unlock_irqrestore(&worker->lock, flags);
out:
return 0;
@@ -463,14 +610,18 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
worker = find_worker(workers);
if (workers->ordered) {
- spin_lock_irqsave(&workers->lock, flags);
+ /*
+ * you're not allowed to do ordered queues from an
+ * interrupt handler
+ */
+ spin_lock(&workers->order_lock);
if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
list_add_tail(&work->order_list,
&workers->prio_order_list);
} else {
list_add_tail(&work->order_list, &workers->order_list);
}
- spin_unlock_irqrestore(&workers->lock, flags);
+ spin_unlock(&workers->order_lock);
} else {
INIT_LIST_HEAD(&work->order_list);
}
@@ -481,7 +632,6 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
list_add_tail(&work->list, &worker->prio_pending);
else
list_add_tail(&work->list, &worker->pending);
- atomic_inc(&worker->num_pending);
check_busy_worker(worker);
/*
@@ -492,10 +642,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
wake = 1;
worker->working = 1;
- spin_unlock_irqrestore(&worker->lock, flags);
-
if (wake)
wake_up_process(worker->task);
+ spin_unlock_irqrestore(&worker->lock, flags);
+
out:
return 0;
}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1b511c109db6..fc089b95ec14 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -73,6 +73,15 @@ struct btrfs_workers {
/* force completions in the order they were queued */
int ordered;
+ /* more workers required, but in an interrupt handler */
+ int atomic_start_pending;
+
+ /*
+ * are we allowed to sleep while starting workers or are we required
+ * to start them at a later time?
+ */
+ int atomic_worker_start;
+
/* list with all the work threads. The workers on the idle thread
* may be actively servicing jobs, but they haven't yet hit the
* idle thresh limit above.
@@ -90,6 +99,9 @@ struct btrfs_workers {
/* lock for finding the next worker thread to queue on */
spinlock_t lock;
+ /* lock for the ordered lists */
+ spinlock_t order_lock;
+
/* extra name for this worker, used for current->name */
char *name;
};
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ea1ea0af8c0e..a54d354cefcb 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -128,6 +128,14 @@ struct btrfs_inode {
u64 last_unlink_trans;
/*
+ * These two counters are for delalloc metadata reservations. We keep
+ * track of how many extents we've accounted for vs how many extents we
+ * have.
+ */
+ int delalloc_reserved_extents;
+ int delalloc_extents;
+
+ /*
* ordered_data_close is set by truncate when a file that used
* to have good data has been truncated to zero. When it is set
* the btrfs file release call will add this inode to the
@@ -138,6 +146,7 @@ struct btrfs_inode {
* of these.
*/
unsigned ordered_data_close:1;
+ unsigned dummy_inode:1;
struct inode vfs_inode;
};
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index de1e2fd32080..a11a32058b50 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -26,7 +26,6 @@
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
-#include <linux/smp_lock.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
@@ -507,10 +506,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
*/
set_page_extent_mapped(page);
lock_extent(tree, last_offset, end, GFP_NOFS);
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, last_offset,
PAGE_CACHE_SIZE);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
if (!em || last_offset < em->start ||
(last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
@@ -594,11 +593,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
em_tree = &BTRFS_I(inode)->extent_tree;
/* we need the actual starting offset of this extent in the file */
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree,
page_offset(bio->bi_io_vec->bv_page),
PAGE_CACHE_SIZE);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
compressed_len = em->block_len;
cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 60a45f3a4e91..ec96f3a6d536 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -557,19 +557,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
btrfs_disk_key_to_cpu(&k1, disk);
- if (k1.objectid > k2->objectid)
- return 1;
- if (k1.objectid < k2->objectid)
- return -1;
- if (k1.type > k2->type)
- return 1;
- if (k1.type < k2->type)
- return -1;
- if (k1.offset > k2->offset)
- return 1;
- if (k1.offset < k2->offset)
- return -1;
- return 0;
+ return btrfs_comp_cpu_keys(&k1, k2);
}
/*
@@ -1052,9 +1040,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
return 0;
- if (btrfs_header_nritems(mid) > 2)
- return 0;
-
if (btrfs_header_nritems(mid) < 2)
err_on_enospc = 1;
@@ -1701,6 +1686,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
struct extent_buffer *b;
int slot;
int ret;
+ int err;
int level;
int lowest_unlock = 1;
u8 lowest_level = 0;
@@ -1737,8 +1723,6 @@ again:
p->locks[level] = 1;
if (cow) {
- int wret;
-
/*
* if we don't really need to cow this block
* then we don't want to set the path blocking,
@@ -1749,12 +1733,12 @@ again:
btrfs_set_path_blocking(p);
- wret = btrfs_cow_block(trans, root, b,
- p->nodes[level + 1],
- p->slots[level + 1], &b);
- if (wret) {
+ err = btrfs_cow_block(trans, root, b,
+ p->nodes[level + 1],
+ p->slots[level + 1], &b);
+ if (err) {
free_extent_buffer(b);
- ret = wret;
+ ret = err;
goto done;
}
}
@@ -1793,41 +1777,45 @@ cow_done:
ret = bin_search(b, key, level, &slot);
if (level != 0) {
- if (ret && slot > 0)
+ int dec = 0;
+ if (ret && slot > 0) {
+ dec = 1;
slot -= 1;
+ }
p->slots[level] = slot;
- ret = setup_nodes_for_search(trans, root, p, b, level,
+ err = setup_nodes_for_search(trans, root, p, b, level,
ins_len);
- if (ret == -EAGAIN)
+ if (err == -EAGAIN)
goto again;
- else if (ret)
+ if (err) {
+ ret = err;
goto done;
+ }
b = p->nodes[level];
slot = p->slots[level];
unlock_up(p, level, lowest_unlock);
- /* this is only true while dropping a snapshot */
if (level == lowest_level) {
- ret = 0;
+ if (dec)
+ p->slots[level]++;
goto done;
}
- ret = read_block_for_search(trans, root, p,
+ err = read_block_for_search(trans, root, p,
&b, level, slot, key);
- if (ret == -EAGAIN)
+ if (err == -EAGAIN)
goto again;
-
- if (ret == -EIO)
+ if (err) {
+ ret = err;
goto done;
+ }
if (!p->skip_locking) {
- int lret;
-
btrfs_clear_path_blocking(p, NULL);
- lret = btrfs_try_spin_lock(b);
+ err = btrfs_try_spin_lock(b);
- if (!lret) {
+ if (!err) {
btrfs_set_path_blocking(p);
btrfs_tree_lock(b);
btrfs_clear_path_blocking(p, b);
@@ -1837,16 +1825,14 @@ cow_done:
p->slots[level] = slot;
if (ins_len > 0 &&
btrfs_leaf_free_space(root, b) < ins_len) {
- int sret;
-
btrfs_set_path_blocking(p);
- sret = split_leaf(trans, root, key,
- p, ins_len, ret == 0);
+ err = split_leaf(trans, root, key,
+ p, ins_len, ret == 0);
btrfs_clear_path_blocking(p, NULL);
- BUG_ON(sret > 0);
- if (sret) {
- ret = sret;
+ BUG_ON(err > 0);
+ if (err) {
+ ret = err;
goto done;
}
}
@@ -2867,6 +2853,12 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
int split;
int num_doubles = 0;
+ l = path->nodes[0];
+ slot = path->slots[0];
+ if (extend && data_size + btrfs_item_size_nr(l, slot) +
+ sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
+ return -EOVERFLOW;
+
/* first try to make some room by pushing left and right */
if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
wret = push_leaf_right(trans, root, path, data_size, 0);
@@ -3807,7 +3799,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
/* delete the leaf if it is mostly empty */
- if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) {
+ if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
/* push_leaf_left fixes the path.
* make sure the path still points to our leaf
* for possible call to del_ptr below
@@ -4042,10 +4034,9 @@ out:
* calling this function.
*/
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
- struct btrfs_key *key, int lowest_level,
+ struct btrfs_key *key, int level,
int cache_only, u64 min_trans)
{
- int level = lowest_level;
int slot;
struct extent_buffer *c;
@@ -4058,11 +4049,40 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
c = path->nodes[level];
next:
if (slot >= btrfs_header_nritems(c)) {
- level++;
- if (level == BTRFS_MAX_LEVEL)
+ int ret;
+ int orig_lowest;
+ struct btrfs_key cur_key;
+ if (level + 1 >= BTRFS_MAX_LEVEL ||
+ !path->nodes[level + 1])
return 1;
- continue;
+
+ if (path->locks[level + 1]) {
+ level++;
+ continue;
+ }
+
+ slot = btrfs_header_nritems(c) - 1;
+ if (level == 0)
+ btrfs_item_key_to_cpu(c, &cur_key, slot);
+ else
+ btrfs_node_key_to_cpu(c, &cur_key, slot);
+
+ orig_lowest = path->lowest_level;
+ btrfs_release_path(root, path);
+ path->lowest_level = level;
+ ret = btrfs_search_slot(NULL, root, &cur_key, path,
+ 0, 0);
+ path->lowest_level = orig_lowest;
+ if (ret < 0)
+ return ret;
+
+ c = path->nodes[level];
+ slot = path->slots[level];
+ if (ret == 0)
+ slot++;
+ goto next;
}
+
if (level == 0)
btrfs_item_key_to_cpu(c, key, slot);
else {
@@ -4146,7 +4166,8 @@ again:
* advance the path if there are now more items available.
*/
if (nritems > 0 && path->slots[0] < nritems - 1) {
- path->slots[0]++;
+ if (ret == 0)
+ path->slots[0]++;
ret = 0;
goto done;
}
@@ -4278,10 +4299,10 @@ int btrfs_previous_item(struct btrfs_root *root,
path->slots[0]--;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (found_key.type == type)
- return 0;
if (found_key.objectid < min_objectid)
break;
+ if (found_key.type == type)
+ return 0;
if (found_key.objectid == min_objectid &&
found_key.type < type)
break;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2779c2f5360a..dd8ced9814c4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -114,6 +114,10 @@ struct btrfs_ordered_sum;
*/
#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
+#define BTRFS_BTREE_INODE_OBJECTID 1
+
+#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
+
/*
* we can actually store much bigger names, but lets not confuse the rest
* of linux
@@ -481,7 +485,7 @@ struct btrfs_shared_data_ref {
struct btrfs_extent_inline_ref {
u8 type;
- u64 offset;
+ __le64 offset;
} __attribute__ ((__packed__));
/* old style backrefs item */
@@ -670,18 +674,20 @@ struct btrfs_space_info {
u64 bytes_reserved; /* total bytes the allocator has reserved for
current allocations */
u64 bytes_readonly; /* total bytes that are read only */
-
- /* delalloc accounting */
- u64 bytes_delalloc; /* number of bytes reserved for allocation,
- this space is not necessarily reserved yet
- by the allocator */
+ u64 bytes_super; /* total bytes reserved for the super blocks */
+ u64 bytes_root; /* the number of bytes needed to commit a
+ transaction */
u64 bytes_may_use; /* number of bytes that may be used for
- delalloc */
+ delalloc/allocations */
+ u64 bytes_delalloc; /* number of bytes currently reserved for
+ delayed allocation */
int full; /* indicates that we cannot allocate any more
chunks for this space */
int force_alloc; /* set if we need to force a chunk alloc for
this space */
+ int force_delalloc; /* make people start doing filemap_flush until
+ we're under a threshold */
struct list_head list;
@@ -689,6 +695,10 @@ struct btrfs_space_info {
struct list_head block_groups;
spinlock_t lock;
struct rw_semaphore groups_sem;
+ atomic_t caching_threads;
+
+ int allocating_chunk;
+ wait_queue_head_t wait;
};
/*
@@ -707,6 +717,9 @@ struct btrfs_free_cluster {
/* first extent starting offset */
u64 window_start;
+ /* if this cluster simply points at a bitmap in the block group */
+ bool points_to_bitmap;
+
struct btrfs_block_group_cache *block_group;
/*
* when a cluster is allocated from a block group, we put the
@@ -716,24 +729,48 @@ struct btrfs_free_cluster {
struct list_head block_group_list;
};
+enum btrfs_caching_type {
+ BTRFS_CACHE_NO = 0,
+ BTRFS_CACHE_STARTED = 1,
+ BTRFS_CACHE_FINISHED = 2,
+};
+
+struct btrfs_caching_control {
+ struct list_head list;
+ struct mutex mutex;
+ wait_queue_head_t wait;
+ struct btrfs_block_group_cache *block_group;
+ u64 progress;
+ atomic_t count;
+};
+
struct btrfs_block_group_cache {
struct btrfs_key key;
struct btrfs_block_group_item item;
+ struct btrfs_fs_info *fs_info;
spinlock_t lock;
- struct mutex cache_mutex;
u64 pinned;
u64 reserved;
+ u64 bytes_super;
u64 flags;
- int cached;
+ u64 sectorsize;
+ int extents_thresh;
+ int free_extents;
+ int total_bitmaps;
int ro;
int dirty;
+ /* cache tracking stuff */
+ int cached;
+ struct btrfs_caching_control *caching_ctl;
+ u64 last_byte_to_unpin;
+
struct btrfs_space_info *space_info;
/* free space cache stuff */
spinlock_t tree_lock;
- struct rb_root free_space_bytes;
struct rb_root free_space_offset;
+ u64 free_space;
/* block group cache stuff */
struct rb_node cache_node;
@@ -765,13 +802,16 @@ struct btrfs_fs_info {
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
+
+ spinlock_t fs_roots_radix_lock;
struct radix_tree_root fs_roots_radix;
/* block group cache stuff */
spinlock_t block_group_cache_lock;
struct rb_root block_group_cache_tree;
- struct extent_io_tree pinned_extents;
+ struct extent_io_tree freed_extents[2];
+ struct extent_io_tree *pinned_extents;
/* logical->physical extent mapping */
struct btrfs_mapping_tree mapping_tree;
@@ -805,10 +845,7 @@ struct btrfs_fs_info {
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
struct mutex chunk_mutex;
- struct mutex drop_mutex;
struct mutex volume_mutex;
- struct mutex tree_reloc_mutex;
-
/*
* this protects the ordered operations list only while we are
* processing all of the entries on it. This way we make
@@ -817,10 +854,16 @@ struct btrfs_fs_info {
* before jumping into the main commit.
*/
struct mutex ordered_operations_mutex;
+ struct rw_semaphore extent_commit_sem;
+
+ struct rw_semaphore subvol_sem;
+
+ struct srcu_struct subvol_srcu;
struct list_head trans_list;
struct list_head hashers;
struct list_head dead_roots;
+ struct list_head caching_block_groups;
atomic_t nr_async_submits;
atomic_t async_submit_draining;
@@ -978,10 +1021,12 @@ struct btrfs_root {
u32 stripesize;
u32 type;
- u64 highest_inode;
- u64 last_inode_alloc;
+
+ u64 highest_objectid;
int ref_cows;
int track_dirty;
+ int in_radix;
+
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
@@ -1902,8 +1947,8 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_update_pinned_extents(struct btrfs_root *root,
- u64 bytenr, u64 num, int pin);
+int btrfs_pin_extent(struct btrfs_root *root,
+ u64 bytenr, u64 num, int reserved);
int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *leaf);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
@@ -1953,9 +1998,10 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 root_objectid, u64 owner, u64 offset);
int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_io_tree *unpin);
+ struct btrfs_root *root);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -1966,6 +2012,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
int btrfs_read_block_groups(struct btrfs_root *root);
+int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr);
int btrfs_make_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytes_used,
u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -1979,7 +2026,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
-int btrfs_check_metadata_free_space(struct btrfs_root *root);
+int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items);
+int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items);
int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
u64 bytes);
void btrfs_free_reserved_data_space(struct btrfs_root *root,
@@ -2074,20 +2126,22 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
- *root);
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
struct extent_buffer *parent);
/* root-item.c */
int btrfs_find_root_ref(struct btrfs_root *tree_root,
- struct btrfs_path *path,
- u64 root_id, u64 ref_id);
+ struct btrfs_path *path,
+ u64 root_id, u64 ref_id);
int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root,
- u64 root_id, u8 type, u64 ref_id,
- u64 dirid, u64 sequence,
+ u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
+ const char *name, int name_len);
+int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *tree_root,
+ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
const char *name, int name_len);
int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_key *key);
@@ -2102,6 +2156,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
int btrfs_search_root(struct btrfs_root *root, u64 search_start,
u64 *found_objectid);
int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
int btrfs_set_root_node(struct btrfs_root_item *item,
struct extent_buffer *node);
/* dir-item.c */
@@ -2120,6 +2175,10 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 dir,
u64 objectid, const char *name, int name_len,
int mod);
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 dirid,
+ const char *name, int name_len);
struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
struct btrfs_path *path,
const char *name, int name_len);
@@ -2142,6 +2201,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset);
int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset);
+int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
/* inode-map.c */
int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
@@ -2214,6 +2274,10 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct inode *parent_inode, struct inode *inode,
const char *name, int name_len, int add_backref, u64 index);
+int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, u64 objectid,
+ const char *name, int name_len);
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode, u64 new_size,
@@ -2224,7 +2288,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
int btrfs_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *new_root, struct dentry *dentry,
+ struct btrfs_root *new_root,
u64 new_dirid, u64 alloc_hint);
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
size_t size, struct bio *bio, unsigned long bio_flags);
@@ -2240,6 +2304,7 @@ int btrfs_write_inode(struct inode *inode, int wait);
void btrfs_dirty_inode(struct inode *inode);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
+void btrfs_drop_inode(struct inode *inode);
int btrfs_init_cachep(void);
void btrfs_destroy_cachep(void);
long btrfs_ioctl_trans_end(struct file *file);
@@ -2257,6 +2322,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
void btrfs_orphan_cleanup(struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t size);
+int btrfs_invalidate_inodes(struct btrfs_root *root);
+extern struct dentry_operations btrfs_dentry_operations;
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -2268,11 +2335,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
int skip_pinned);
int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
-extern struct file_operations btrfs_file_operations;
+extern const struct file_operations btrfs_file_operations;
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
u64 start, u64 end, u64 locked_end,
- u64 inline_limit, u64 *hint_block);
+ u64 inline_limit, u64 *hint_block, int drop_cache);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode, u64 start, u64 end);
@@ -2299,7 +2366,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
int btrfs_sync_fs(struct super_block *sb, int wait);
/* acl.c */
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_POSIX_ACL
int btrfs_check_acl(struct inode *inode, int mask);
#else
#define btrfs_check_acl NULL
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 1d70236ba00c..f3a6075519cc 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -281,6 +281,53 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
return btrfs_match_dir_item_name(root, path, name, name_len);
}
+struct btrfs_dir_item *
+btrfs_search_dir_index_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 dirid,
+ const char *name, int name_len)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ u32 nritems;
+ int ret;
+
+ key.objectid = dirid;
+ key.type = BTRFS_DIR_INDEX_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ERR_PTR(ret);
+
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+
+ while (1) {
+ if (path->slots[0] >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ break;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
+ break;
+
+ di = btrfs_match_dir_item_name(root, path, name, name_len);
+ if (di)
+ return di;
+
+ path->slots[0]++;
+ }
+ return NULL;
+}
+
struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d28d29c95f7c..af0435f79fa6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -41,6 +41,7 @@
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
+static void free_fs_root(struct btrfs_root *root);
static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
@@ -123,15 +124,15 @@ static struct extent_map *btree_get_extent(struct inode *inode,
struct extent_map *em;
int ret;
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (em) {
em->bdev =
BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
goto out;
}
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
em = alloc_extent_map(GFP_NOFS);
if (!em) {
@@ -144,7 +145,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
em->block_start = 0;
em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
if (ret == -EEXIST) {
u64 failed_start = em->start;
@@ -163,7 +164,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
free_extent_map(em);
em = NULL;
}
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
if (ret)
em = ERR_PTR(ret);
@@ -772,7 +773,7 @@ static void btree_invalidatepage(struct page *page, unsigned long offset)
}
}
-static struct address_space_operations btree_aops = {
+static const struct address_space_operations btree_aops = {
.readpage = btree_readpage,
.writepage = btree_writepage,
.writepages = btree_writepages,
@@ -821,14 +822,14 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
int btrfs_write_tree_block(struct extent_buffer *buf)
{
- return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start,
- buf->start + buf->len - 1, WB_SYNC_ALL);
+ return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
+ buf->start + buf->len - 1);
}
int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{
- return btrfs_wait_on_page_writeback_range(buf->first_page->mapping,
- buf->start, buf->start + buf->len - 1);
+ return filemap_fdatawait_range(buf->first_page->mapping,
+ buf->start, buf->start + buf->len - 1);
}
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
@@ -895,8 +896,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->fs_info = fs_info;
root->objectid = objectid;
root->last_trans = 0;
- root->highest_inode = 0;
- root->last_inode_alloc = 0;
+ root->highest_objectid = 0;
root->name = NULL;
root->in_sysfs = 0;
root->inode_tree.rb_node = NULL;
@@ -952,14 +952,16 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
root, fs_info, objectid);
ret = btrfs_find_last_root(tree_root, objectid,
&root->root_item, &root->root_key);
+ if (ret > 0)
+ return -ENOENT;
BUG_ON(ret);
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
- root->commit_root = btrfs_root_node(root);
BUG_ON(!root->node);
+ root->commit_root = btrfs_root_node(root);
return 0;
}
@@ -1095,7 +1097,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
struct btrfs_fs_info *fs_info = tree_root->fs_info;
struct btrfs_path *path;
struct extent_buffer *l;
- u64 highest_inode;
u64 generation;
u32 blocksize;
int ret = 0;
@@ -1110,7 +1111,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
kfree(root);
return ERR_PTR(ret);
}
- goto insert;
+ goto out;
}
__setup_root(tree_root->nodesize, tree_root->leafsize,
@@ -1120,39 +1121,30 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
path = btrfs_alloc_path();
BUG_ON(!path);
ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
- if (ret != 0) {
- if (ret > 0)
- ret = -ENOENT;
- goto out;
+ if (ret == 0) {
+ l = path->nodes[0];
+ read_extent_buffer(l, &root->root_item,
+ btrfs_item_ptr_offset(l, path->slots[0]),
+ sizeof(root->root_item));
+ memcpy(&root->root_key, location, sizeof(*location));
}
- l = path->nodes[0];
- read_extent_buffer(l, &root->root_item,
- btrfs_item_ptr_offset(l, path->slots[0]),
- sizeof(root->root_item));
- memcpy(&root->root_key, location, sizeof(*location));
- ret = 0;
-out:
- btrfs_release_path(root, path);
btrfs_free_path(path);
if (ret) {
- kfree(root);
+ if (ret > 0)
+ ret = -ENOENT;
return ERR_PTR(ret);
}
+
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
root->commit_root = btrfs_root_node(root);
BUG_ON(!root->node);
-insert:
- if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
+out:
+ if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
root->ref_cows = 1;
- ret = btrfs_find_highest_inode(root, &highest_inode);
- if (ret == 0) {
- root->highest_inode = highest_inode;
- root->last_inode_alloc = highest_inode;
- }
- }
+
return root;
}
@@ -1187,39 +1179,66 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
return fs_info->dev_root;
if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
return fs_info->csum_root;
-
+again:
+ spin_lock(&fs_info->fs_roots_radix_lock);
root = radix_tree_lookup(&fs_info->fs_roots_radix,
(unsigned long)location->objectid);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
if (root)
return root;
+ ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+ if (ret == 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ return ERR_PTR(ret);
+
root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
if (IS_ERR(root))
return root;
+ WARN_ON(btrfs_root_refs(&root->root_item) == 0);
set_anon_super(&root->anon_super, NULL);
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret)
+ goto fail;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
ret = radix_tree_insert(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
root);
+ if (ret == 0)
+ root->in_radix = 1;
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ radix_tree_preload_end();
if (ret) {
- free_extent_buffer(root->node);
- kfree(root);
- return ERR_PTR(ret);
+ if (ret == -EEXIST) {
+ free_fs_root(root);
+ goto again;
+ }
+ goto fail;
}
- if (!(fs_info->sb->s_flags & MS_RDONLY)) {
- ret = btrfs_find_dead_roots(fs_info->tree_root,
- root->root_key.objectid);
- BUG_ON(ret);
+
+ ret = btrfs_find_dead_roots(fs_info->tree_root,
+ root->root_key.objectid);
+ WARN_ON(ret);
+
+ if (!(fs_info->sb->s_flags & MS_RDONLY))
btrfs_orphan_cleanup(root);
- }
+
return root;
+fail:
+ free_fs_root(root);
+ return ERR_PTR(ret);
}
struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_key *location,
const char *name, int namelen)
{
+ return btrfs_read_fs_root_no_name(fs_info, location);
+#if 0
struct btrfs_root *root;
int ret;
@@ -1236,7 +1255,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
kfree(root);
return ERR_PTR(ret);
}
-#if 0
+
ret = btrfs_sysfs_add_root(root);
if (ret) {
free_extent_buffer(root->node);
@@ -1244,9 +1263,9 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
kfree(root);
return ERR_PTR(ret);
}
-#endif
root->in_sysfs = 1;
return root;
+#endif
}
static int btrfs_congested_fn(void *congested_data, int bdi_bits)
@@ -1325,9 +1344,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
offset = page_offset(page);
em_tree = &BTRFS_I(inode)->extent_tree;
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
if (!em) {
__unplug_io_fn(bdi, page);
return;
@@ -1352,6 +1371,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
{
int err;
+ bdi->name = "btrfs";
bdi->capabilities = BDI_CAP_MAP_COPY;
err = bdi_init(bdi);
if (err)
@@ -1359,8 +1379,10 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
err = bdi_register(bdi, NULL, "btrfs-%d",
atomic_inc_return(&btrfs_bdi_num));
- if (err)
+ if (err) {
+ bdi_destroy(bdi);
return err;
+ }
bdi->ra_pages = default_backing_dev_info.ra_pages;
bdi->unplug_io_fn = btrfs_unplug_io_fn;
@@ -1450,9 +1472,12 @@ static int cleaner_kthread(void *arg)
break;
vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
- mutex_lock(&root->fs_info->cleaner_mutex);
- btrfs_clean_old_snapshots(root);
- mutex_unlock(&root->fs_info->cleaner_mutex);
+
+ if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
+ mutex_trylock(&root->fs_info->cleaner_mutex)) {
+ btrfs_clean_old_snapshots(root);
+ mutex_unlock(&root->fs_info->cleaner_mutex);
+ }
if (freezing(current)) {
refrigerator();
@@ -1557,15 +1582,36 @@ struct btrfs_root *open_ctree(struct super_block *sb,
err = -ENOMEM;
goto fail;
}
- INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
+
+ ret = init_srcu_struct(&fs_info->subvol_srcu);
+ if (ret) {
+ err = ret;
+ goto fail;
+ }
+
+ ret = setup_bdi(fs_info, &fs_info->bdi);
+ if (ret) {
+ err = ret;
+ goto fail_srcu;
+ }
+
+ fs_info->btree_inode = new_inode(sb);
+ if (!fs_info->btree_inode) {
+ err = -ENOMEM;
+ goto fail_bdi;
+ }
+
+ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->hashers);
INIT_LIST_HEAD(&fs_info->delalloc_inodes);
INIT_LIST_HEAD(&fs_info->ordered_operations);
+ INIT_LIST_HEAD(&fs_info->caching_block_groups);
spin_lock_init(&fs_info->delalloc_lock);
spin_lock_init(&fs_info->new_trans_lock);
spin_lock_init(&fs_info->ref_cache_lock);
+ spin_lock_init(&fs_info->fs_roots_radix_lock);
init_completion(&fs_info->kobj_unregister);
fs_info->tree_root = tree_root;
@@ -1584,12 +1630,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->sb = sb;
fs_info->max_extent = (u64)-1;
fs_info->max_inline = 8192 * 1024;
- if (setup_bdi(fs_info, &fs_info->bdi))
- goto fail_bdi;
- fs_info->btree_inode = new_inode(sb);
- fs_info->btree_inode->i_ino = 1;
- fs_info->btree_inode->i_nlink = 1;
- fs_info->metadata_ratio = 8;
+ fs_info->metadata_ratio = 0;
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
@@ -1599,7 +1640,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
+ sb->s_bdi = &fs_info->bdi;
+ fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
+ fs_info->btree_inode->i_nlink = 1;
/*
* we set the i_size on the btree inode to the max possible int.
* the real end of the address space is determined by all of
@@ -1618,27 +1662,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
+ BTRFS_I(fs_info->btree_inode)->root = tree_root;
+ memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
+ sizeof(struct btrfs_key));
+ BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
+ insert_inode_hash(fs_info->btree_inode);
+
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree.rb_node = NULL;
- extent_io_tree_init(&fs_info->pinned_extents,
+ extent_io_tree_init(&fs_info->freed_extents[0],
+ fs_info->btree_inode->i_mapping, GFP_NOFS);
+ extent_io_tree_init(&fs_info->freed_extents[1],
fs_info->btree_inode->i_mapping, GFP_NOFS);
+ fs_info->pinned_extents = &fs_info->freed_extents[0];
fs_info->do_barriers = 1;
- BTRFS_I(fs_info->btree_inode)->root = tree_root;
- memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
- sizeof(struct btrfs_key));
- insert_inode_hash(fs_info->btree_inode);
mutex_init(&fs_info->trans_mutex);
mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
- mutex_init(&fs_info->drop_mutex);
mutex_init(&fs_info->chunk_mutex);
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
- mutex_init(&fs_info->tree_reloc_mutex);
+ init_rwsem(&fs_info->extent_commit_sem);
+ init_rwsem(&fs_info->subvol_sem);
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -1697,7 +1746,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
err = -EINVAL;
goto fail_iput;
}
-
+printk("thread pool is %d\n", fs_info->thread_pool_size);
/*
* we need to start all the end_io workers up front because the
* queue work function gets called at interrupt time, and so it
@@ -1742,20 +1791,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->endio_workers.idle_thresh = 4;
fs_info->endio_meta_workers.idle_thresh = 4;
- fs_info->endio_write_workers.idle_thresh = 64;
- fs_info->endio_meta_write_workers.idle_thresh = 64;
+ fs_info->endio_write_workers.idle_thresh = 2;
+ fs_info->endio_meta_write_workers.idle_thresh = 2;
+
+ fs_info->endio_workers.atomic_worker_start = 1;
+ fs_info->endio_meta_workers.atomic_worker_start = 1;
+ fs_info->endio_write_workers.atomic_worker_start = 1;
+ fs_info->endio_meta_write_workers.atomic_worker_start = 1;
btrfs_start_workers(&fs_info->workers, 1);
btrfs_start_workers(&fs_info->submit_workers, 1);
btrfs_start_workers(&fs_info->delalloc_workers, 1);
btrfs_start_workers(&fs_info->fixup_workers, 1);
- btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
- btrfs_start_workers(&fs_info->endio_meta_workers,
- fs_info->thread_pool_size);
- btrfs_start_workers(&fs_info->endio_meta_write_workers,
- fs_info->thread_pool_size);
- btrfs_start_workers(&fs_info->endio_write_workers,
- fs_info->thread_pool_size);
+ btrfs_start_workers(&fs_info->endio_workers, 1);
+ btrfs_start_workers(&fs_info->endio_meta_workers, 1);
+ btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
+ btrfs_start_workers(&fs_info->endio_write_workers, 1);
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1799,6 +1850,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_super_chunk_root(disk_super),
blocksize, generation);
BUG_ON(!chunk_root->node);
+ if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+ printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
+ sb->s_id);
+ goto fail_chunk_root;
+ }
btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1826,6 +1882,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
blocksize, generation);
if (!tree_root->node)
goto fail_chunk_root;
+ if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+ printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
+ sb->s_id);
+ goto fail_tree_root;
+ }
btrfs_set_root_node(&tree_root->root_item, tree_root->node);
tree_root->commit_root = btrfs_root_node(tree_root);
@@ -1905,6 +1966,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
}
}
+ ret = btrfs_find_orphan_roots(tree_root);
+ BUG_ON(ret);
+
if (!(sb->s_flags & MS_RDONLY)) {
ret = btrfs_recover_relocation(tree_root);
BUG_ON(ret);
@@ -1964,6 +2028,8 @@ fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
fail_bdi:
bdi_destroy(&fs_info->bdi);
+fail_srcu:
+ cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
kfree(extent_root);
kfree(tree_root);
@@ -2223,20 +2289,29 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
{
- WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+ spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_delete(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid);
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ if (btrfs_root_refs(&root->root_item) == 0)
+ synchronize_srcu(&fs_info->subvol_srcu);
+
+ free_fs_root(root);
+ return 0;
+}
+
+static void free_fs_root(struct btrfs_root *root)
+{
+ WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
if (root->anon_super.s_dev) {
down_write(&root->anon_super.s_umount);
kill_anon_super(&root->anon_super);
}
- if (root->node)
- free_extent_buffer(root->node);
- if (root->commit_root)
- free_extent_buffer(root->commit_root);
+ free_extent_buffer(root->node);
+ free_extent_buffer(root->commit_root);
kfree(root->name);
kfree(root);
- return 0;
}
static int del_fs_roots(struct btrfs_fs_info *fs_info)
@@ -2245,6 +2320,20 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
struct btrfs_root *gang[8];
int i;
+ while (!list_empty(&fs_info->dead_roots)) {
+ gang[0] = list_entry(fs_info->dead_roots.next,
+ struct btrfs_root, root_list);
+ list_del(&gang[0]->root_list);
+
+ if (gang[0]->in_radix) {
+ btrfs_free_fs_root(fs_info, gang[0]);
+ } else {
+ free_extent_buffer(gang[0]->node);
+ free_extent_buffer(gang[0]->commit_root);
+ kfree(gang[0]);
+ }
+ }
+
while (1) {
ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, 0,
@@ -2274,9 +2363,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
root_objectid = gang[ret - 1]->root_key.objectid + 1;
for (i = 0; i < ret; i++) {
root_objectid = gang[i]->root_key.objectid;
- ret = btrfs_find_dead_roots(fs_info->tree_root,
- root_objectid);
- BUG_ON(ret);
btrfs_orphan_cleanup(gang[i]);
}
root_objectid++;
@@ -2322,6 +2408,9 @@ int close_ctree(struct btrfs_root *root)
printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
}
+ fs_info->closing = 2;
+ smp_mb();
+
if (fs_info->delalloc_bytes) {
printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
(unsigned long long)fs_info->delalloc_bytes);
@@ -2361,6 +2450,7 @@ int close_ctree(struct btrfs_root *root)
btrfs_mapping_tree_free(&fs_info->mapping_tree);
bdi_destroy(&fs_info->bdi);
+ cleanup_srcu_struct(&fs_info->subvol_srcu);
kfree(fs_info->extent_root);
kfree(fs_info->tree_root);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 9596b40caa4e..ba5c3fd5ab8c 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -28,7 +28,7 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
len = BTRFS_FID_SIZE_NON_CONNECTABLE;
type = FILEID_BTRFS_WITHOUT_PARENT;
- fid->objectid = BTRFS_I(inode)->location.objectid;
+ fid->objectid = inode->i_ino;
fid->root_objectid = BTRFS_I(inode)->root->objectid;
fid->gen = inode->i_generation;
@@ -60,34 +60,61 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
}
static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation)
+ u64 root_objectid, u32 generation,
+ int check_generation)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
struct btrfs_root *root;
+ struct dentry *dentry;
struct inode *inode;
struct btrfs_key key;
+ int index;
+ int err = 0;
+
+ if (objectid < BTRFS_FIRST_FREE_OBJECTID)
+ return ERR_PTR(-ESTALE);
key.objectid = root_objectid;
btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
key.offset = (u64)-1;
- root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
- if (IS_ERR(root))
- return ERR_CAST(root);
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
+ root = btrfs_read_fs_root_no_name(fs_info, &key);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ goto fail;
+ }
+
+ if (btrfs_root_refs(&root->root_item) == 0) {
+ err = -ENOENT;
+ goto fail;
+ }
key.objectid = objectid;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
key.offset = 0;
inode = btrfs_iget(sb, &key, root);
- if (IS_ERR(inode))
- return (void *)inode;
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto fail;
+ }
+
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
- if (generation != inode->i_generation) {
+ if (check_generation && generation != inode->i_generation) {
iput(inode);
return ERR_PTR(-ESTALE);
}
- return d_obtain_alias(inode);
+ dentry = d_obtain_alias(inode);
+ if (!IS_ERR(dentry))
+ dentry->d_op = &btrfs_dentry_operations;
+ return dentry;
+fail:
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ return ERR_PTR(err);
}
static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
@@ -111,7 +138,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
objectid = fid->parent_objectid;
generation = fid->parent_gen;
- return btrfs_get_dentry(sb, objectid, root_objectid, generation);
+ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
}
static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
@@ -133,66 +160,76 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
root_objectid = fid->root_objectid;
generation = fid->gen;
- return btrfs_get_dentry(sb, objectid, root_objectid, generation);
+ return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
}
static struct dentry *btrfs_get_parent(struct dentry *child)
{
struct inode *dir = child->d_inode;
+ static struct dentry *dentry;
struct btrfs_root *root = BTRFS_I(dir)->root;
- struct btrfs_key key;
struct btrfs_path *path;
struct extent_buffer *leaf;
- int slot;
- u64 objectid;
+ struct btrfs_root_ref *ref;
+ struct btrfs_key key;
+ struct btrfs_key found_key;
int ret;
path = btrfs_alloc_path();
- key.objectid = dir->i_ino;
- btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
- key.offset = (u64)-1;
+ if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ key.objectid = root->root_key.objectid;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
+ key.offset = (u64)-1;
+ root = root->fs_info->tree_root;
+ } else {
+ key.objectid = dir->i_ino;
+ key.type = BTRFS_INODE_REF_KEY;
+ key.offset = (u64)-1;
+ }
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- /* Error */
- btrfs_free_path(path);
- return ERR_PTR(ret);
+ if (ret < 0)
+ goto fail;
+
+ BUG_ON(ret == 0);
+ if (path->slots[0] == 0) {
+ ret = -ENOENT;
+ goto fail;
}
+
+ path->slots[0]--;
leaf = path->nodes[0];
- slot = path->slots[0];
- if (ret) {
- /* btrfs_search_slot() returns the slot where we'd want to
- insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
- The _real_ backref, telling us what the parent inode
- _actually_ is, will be in the slot _before_ the one
- that btrfs_search_slot() returns. */
- if (!slot) {
- /* Unless there is _no_ key in the tree before... */
- btrfs_free_path(path);
- return ERR_PTR(-EIO);
- }
- slot--;
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != key.objectid || found_key.type != key.type) {
+ ret = -ENOENT;
+ goto fail;
}
- btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_root_ref);
+ key.objectid = btrfs_root_ref_dirid(leaf, ref);
+ } else {
+ key.objectid = found_key.offset;
+ }
btrfs_free_path(path);
- if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
- return ERR_PTR(-EINVAL);
-
- objectid = key.offset;
-
- /* If we are already at the root of a subvol, return the real root */
- if (objectid == dir->i_ino)
- return dget(dir->i_sb->s_root);
+ if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
+ return btrfs_get_dentry(root->fs_info->sb, key.objectid,
+ found_key.offset, 0, 0);
+ }
- /* Build a new key for the inode item */
- key.objectid = objectid;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
-
- return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+ dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+ if (!IS_ERR(dentry))
+ dentry->d_op = &btrfs_dentry_operations;
+ return dentry;
+fail:
+ btrfs_free_path(path);
+ return ERR_PTR(ret);
}
const struct export_operations btrfs_export_ops = {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index edc7d208c5ce..359a754c782c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -21,6 +21,7 @@
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
+#include <linux/kthread.h>
#include "compat.h"
#include "hash.h"
#include "ctree.h"
@@ -31,12 +32,12 @@
#include "locking.h"
#include "free-space-cache.h"
-static int update_reserved_extents(struct btrfs_root *root,
- u64 bytenr, u64 num, int reserve);
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc,
int mark_free);
+static int update_reserved_extents(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -56,10 +57,26 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
int level, struct btrfs_key *ins);
-
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 alloc_bytes,
u64 flags, int force);
+static int pin_down_bytes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 bytenr, u64 num_bytes,
+ int is_data, int reserved,
+ struct extent_buffer **must_clean);
+static int find_next_key(struct btrfs_path *path, int level,
+ struct btrfs_key *key);
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+ int dump_block_groups);
+
+static noinline int
+block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+ smp_mb();
+ return cache->cached == BTRFS_CACHE_FINISHED;
+}
static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
@@ -145,21 +162,96 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
return ret;
}
+static int add_excluded_extent(struct btrfs_root *root,
+ u64 start, u64 num_bytes)
+{
+ u64 end = start + num_bytes - 1;
+ set_extent_bits(&root->fs_info->freed_extents[0],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
+ set_extent_bits(&root->fs_info->freed_extents[1],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
+ return 0;
+}
+
+static void free_excluded_extents(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
+{
+ u64 start, end;
+
+ start = cache->key.objectid;
+ end = start + cache->key.offset - 1;
+
+ clear_extent_bits(&root->fs_info->freed_extents[0],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
+ clear_extent_bits(&root->fs_info->freed_extents[1],
+ start, end, EXTENT_UPTODATE, GFP_NOFS);
+}
+
+static int exclude_super_stripes(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
+{
+ u64 bytenr;
+ u64 *logical;
+ int stripe_len;
+ int i, nr, ret;
+
+ for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ bytenr = btrfs_sb_offset(i);
+ ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
+ cache->key.objectid, bytenr,
+ 0, &logical, &nr, &stripe_len);
+ BUG_ON(ret);
+
+ while (nr--) {
+ cache->bytes_super += stripe_len;
+ ret = add_excluded_extent(root, logical[nr],
+ stripe_len);
+ BUG_ON(ret);
+ }
+
+ kfree(logical);
+ }
+ return 0;
+}
+
+static struct btrfs_caching_control *
+get_caching_control(struct btrfs_block_group_cache *cache)
+{
+ struct btrfs_caching_control *ctl;
+
+ spin_lock(&cache->lock);
+ if (cache->cached != BTRFS_CACHE_STARTED) {
+ spin_unlock(&cache->lock);
+ return NULL;
+ }
+
+ ctl = cache->caching_ctl;
+ atomic_inc(&ctl->count);
+ spin_unlock(&cache->lock);
+ return ctl;
+}
+
+static void put_caching_control(struct btrfs_caching_control *ctl)
+{
+ if (atomic_dec_and_test(&ctl->count))
+ kfree(ctl);
+}
+
/*
* this is only called by cache_block_group, since we could have freed extents
* we need to check the pinned_extents for any extents that can't be used yet
* since their free space will be released as soon as the transaction commits.
*/
-static int add_new_free_space(struct btrfs_block_group_cache *block_group,
+static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
struct btrfs_fs_info *info, u64 start, u64 end)
{
- u64 extent_start, extent_end, size;
+ u64 extent_start, extent_end, size, total_added = 0;
int ret;
while (start < end) {
- ret = find_first_extent_bit(&info->pinned_extents, start,
+ ret = find_first_extent_bit(info->pinned_extents, start,
&extent_start, &extent_end,
- EXTENT_DIRTY);
+ EXTENT_DIRTY | EXTENT_UPTODATE);
if (ret)
break;
@@ -167,6 +259,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
start = extent_end + 1;
} else if (extent_start > start && extent_start < end) {
size = extent_start - start;
+ total_added += size;
ret = btrfs_add_free_space(block_group, start,
size);
BUG_ON(ret);
@@ -178,112 +271,183 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group,
if (start < end) {
size = end - start;
+ total_added += size;
ret = btrfs_add_free_space(block_group, start, size);
BUG_ON(ret);
}
- return 0;
-}
-
-static int remove_sb_from_cache(struct btrfs_root *root,
- struct btrfs_block_group_cache *cache)
-{
- u64 bytenr;
- u64 *logical;
- int stripe_len;
- int i, nr, ret;
-
- for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
- bytenr = btrfs_sb_offset(i);
- ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
- cache->key.objectid, bytenr, 0,
- &logical, &nr, &stripe_len);
- BUG_ON(ret);
- while (nr--) {
- btrfs_remove_free_space(cache, logical[nr],
- stripe_len);
- }
- kfree(logical);
- }
- return 0;
+ return total_added;
}
-static int cache_block_group(struct btrfs_root *root,
- struct btrfs_block_group_cache *block_group)
+static int caching_kthread(void *data)
{
+ struct btrfs_block_group_cache *block_group = data;
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
+ struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
+ struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_path *path;
- int ret = 0;
- struct btrfs_key key;
struct extent_buffer *leaf;
- int slot;
- u64 last;
-
- if (!block_group)
- return 0;
-
- root = root->fs_info->extent_root;
-
- if (block_group->cached)
- return 0;
+ struct btrfs_key key;
+ u64 total_found = 0;
+ u64 last = 0;
+ u32 nritems;
+ int ret = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = 2;
+ exclude_super_stripes(extent_root, block_group);
+ spin_lock(&block_group->space_info->lock);
+ block_group->space_info->bytes_super += block_group->bytes_super;
+ spin_unlock(&block_group->space_info->lock);
+
+ last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+
/*
- * we get into deadlocks with paths held by callers of this function.
- * since the alloc_mutex is protecting things right now, just
- * skip the locking here
+ * We don't want to deadlock with somebody trying to allocate a new
+ * extent for the extent root while also trying to search the extent
+ * root to add free space. So we skip locking and search the commit
+ * root, since its read-only
*/
path->skip_locking = 1;
- last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+ path->search_commit_root = 1;
+ path->reada = 2;
+
key.objectid = last;
key.offset = 0;
- btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+again:
+ mutex_lock(&caching_ctl->mutex);
+ /* need to make sure the commit_root doesn't disappear */
+ down_read(&fs_info->extent_commit_sem);
+
+ ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto err;
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+
while (1) {
- leaf = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto err;
- if (ret == 0)
- continue;
- else
+ smp_mb();
+ if (fs_info->closing > 1) {
+ last = (u64)-1;
+ break;
+ }
+
+ if (path->slots[0] < nritems) {
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ } else {
+ ret = find_next_key(path, 0, &key);
+ if (ret)
break;
+
+ caching_ctl->progress = last;
+ btrfs_release_path(extent_root, path);
+ up_read(&fs_info->extent_commit_sem);
+ mutex_unlock(&caching_ctl->mutex);
+ if (btrfs_transaction_in_commit(fs_info))
+ schedule_timeout(1);
+ else
+ cond_resched();
+ goto again;
+ }
+
+ if (key.objectid < block_group->key.objectid) {
+ path->slots[0]++;
+ continue;
}
- btrfs_item_key_to_cpu(leaf, &key, slot);
- if (key.objectid < block_group->key.objectid)
- goto next;
if (key.objectid >= block_group->key.objectid +
block_group->key.offset)
break;
- if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
- add_new_free_space(block_group, root->fs_info, last,
- key.objectid);
-
+ if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+ total_found += add_new_free_space(block_group,
+ fs_info, last,
+ key.objectid);
last = key.objectid + key.offset;
+
+ if (total_found > (1024 * 1024 * 2)) {
+ total_found = 0;
+ wake_up(&caching_ctl->wait);
+ }
}
-next:
path->slots[0]++;
}
+ ret = 0;
- add_new_free_space(block_group, root->fs_info, last,
- block_group->key.objectid +
- block_group->key.offset);
+ total_found += add_new_free_space(block_group, fs_info, last,
+ block_group->key.objectid +
+ block_group->key.offset);
+ caching_ctl->progress = (u64)-1;
+
+ spin_lock(&block_group->lock);
+ block_group->caching_ctl = NULL;
+ block_group->cached = BTRFS_CACHE_FINISHED;
+ spin_unlock(&block_group->lock);
- block_group->cached = 1;
- remove_sb_from_cache(root, block_group);
- ret = 0;
err:
btrfs_free_path(path);
+ up_read(&fs_info->extent_commit_sem);
+
+ free_excluded_extents(extent_root, block_group);
+
+ mutex_unlock(&caching_ctl->mutex);
+ wake_up(&caching_ctl->wait);
+
+ put_caching_control(caching_ctl);
+ atomic_dec(&block_group->space_info->caching_threads);
+ return 0;
+}
+
+static int cache_block_group(struct btrfs_block_group_cache *cache)
+{
+ struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_caching_control *caching_ctl;
+ struct task_struct *tsk;
+ int ret = 0;
+
+ smp_mb();
+ if (cache->cached != BTRFS_CACHE_NO)
+ return 0;
+
+ caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
+ BUG_ON(!caching_ctl);
+
+ INIT_LIST_HEAD(&caching_ctl->list);
+ mutex_init(&caching_ctl->mutex);
+ init_waitqueue_head(&caching_ctl->wait);
+ caching_ctl->block_group = cache;
+ caching_ctl->progress = cache->key.objectid;
+ /* one for caching kthread, one for caching block group list */
+ atomic_set(&caching_ctl->count, 2);
+
+ spin_lock(&cache->lock);
+ if (cache->cached != BTRFS_CACHE_NO) {
+ spin_unlock(&cache->lock);
+ kfree(caching_ctl);
+ return 0;
+ }
+ cache->caching_ctl = caching_ctl;
+ cache->cached = BTRFS_CACHE_STARTED;
+ spin_unlock(&cache->lock);
+
+ down_write(&fs_info->extent_commit_sem);
+ list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
+ up_write(&fs_info->extent_commit_sem);
+
+ atomic_inc(&cache->space_info->caching_threads);
+
+ tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
+ cache->key.objectid);
+ if (IS_ERR(tsk)) {
+ ret = PTR_ERR(tsk);
+ printk(KERN_ERR "error running thread %d\n", ret);
+ BUG();
+ }
+
return ret;
}
@@ -990,15 +1154,13 @@ static inline int extent_ref_type(u64 parent, u64 owner)
return type;
}
-static int find_next_key(struct btrfs_path *path, struct btrfs_key *key)
+static int find_next_key(struct btrfs_path *path, int level,
+ struct btrfs_key *key)
{
- int level;
- BUG_ON(!path->keep_locks);
- for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+ for (; level < BTRFS_MAX_LEVEL; level++) {
if (!path->nodes[level])
break;
- btrfs_assert_tree_locked(path->nodes[level]);
if (path->slots[level] + 1 >=
btrfs_header_nritems(path->nodes[level]))
continue;
@@ -1158,7 +1320,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
* For simplicity, we just do not add new inline back
* ref if there is any kind of item for this block
*/
- if (find_next_key(path, &key) == 0 && key.objectid == bytenr &&
+ if (find_next_key(path, 0, &key) == 0 &&
+ key.objectid == bytenr &&
key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
err = -EAGAIN;
goto out;
@@ -1409,7 +1572,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
static void btrfs_issue_discard(struct block_device *bdev,
u64 start, u64 len)
{
- blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+ blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
+ DISCARD_FL_BARRIER);
}
#endif
@@ -1554,7 +1718,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
parent, ref_root, flags,
ref->objectid, ref->offset,
&ins, node->ref_mod);
- update_reserved_extents(root, ins.objectid, ins.offset, 0);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
node->num_bytes, parent,
@@ -1680,7 +1843,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
extent_op->flags_to_set,
&extent_op->key,
ref->level, &ins);
- update_reserved_extents(root, ins.objectid, ins.offset, 0);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
node->num_bytes, parent, ref_root,
@@ -1715,16 +1877,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
BUG_ON(extent_op);
head = btrfs_delayed_node_to_head(node);
if (insert_reserved) {
+ int mark_free = 0;
+ struct extent_buffer *must_clean = NULL;
+
+ ret = pin_down_bytes(trans, root, NULL,
+ node->bytenr, node->num_bytes,
+ head->is_data, 1, &must_clean);
+ if (ret > 0)
+ mark_free = 1;
+
+ if (must_clean) {
+ clean_tree_block(NULL, root, must_clean);
+ btrfs_tree_unlock(must_clean);
+ free_extent_buffer(must_clean);
+ }
if (head->is_data) {
ret = btrfs_del_csums(trans, root,
node->bytenr,
node->num_bytes);
BUG_ON(ret);
}
- btrfs_update_pinned_extents(root, node->bytenr,
- node->num_bytes, 1);
- update_reserved_extents(root, node->bytenr,
- node->num_bytes, 0);
+ if (mark_free) {
+ ret = btrfs_free_reserved_extent(root,
+ node->bytenr,
+ node->num_bytes);
+ BUG_ON(ret);
+ }
}
mutex_unlock(&head->mutex);
return 0;
@@ -2388,13 +2566,29 @@ fail:
}
+static struct btrfs_block_group_cache *
+next_block_group(struct btrfs_root *root,
+ struct btrfs_block_group_cache *cache)
+{
+ struct rb_node *node;
+ spin_lock(&root->fs_info->block_group_cache_lock);
+ node = rb_next(&cache->cache_node);
+ btrfs_put_block_group(cache);
+ if (node) {
+ cache = rb_entry(node, struct btrfs_block_group_cache,
+ cache_node);
+ atomic_inc(&cache->count);
+ } else
+ cache = NULL;
+ spin_unlock(&root->fs_info->block_group_cache_lock);
+ return cache;
+}
+
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- struct btrfs_block_group_cache *cache, *entry;
- struct rb_node *n;
+ struct btrfs_block_group_cache *cache;
int err = 0;
- int werr = 0;
struct btrfs_path *path;
u64 last = 0;
@@ -2403,39 +2597,35 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
return -ENOMEM;
while (1) {
- cache = NULL;
- spin_lock(&root->fs_info->block_group_cache_lock);
- for (n = rb_first(&root->fs_info->block_group_cache_tree);
- n; n = rb_next(n)) {
- entry = rb_entry(n, struct btrfs_block_group_cache,
- cache_node);
- if (entry->dirty) {
- cache = entry;
- break;
- }
+ if (last == 0) {
+ err = btrfs_run_delayed_refs(trans, root,
+ (unsigned long)-1);
+ BUG_ON(err);
}
- spin_unlock(&root->fs_info->block_group_cache_lock);
- if (!cache)
- break;
+ cache = btrfs_lookup_first_block_group(root->fs_info, last);
+ while (cache) {
+ if (cache->dirty)
+ break;
+ cache = next_block_group(root, cache);
+ }
+ if (!cache) {
+ if (last == 0)
+ break;
+ last = 0;
+ continue;
+ }
cache->dirty = 0;
- last += cache->key.offset;
+ last = cache->key.objectid + cache->key.offset;
- err = write_one_cache_group(trans, root,
- path, cache);
- /*
- * if we fail to write the cache group, we want
- * to keep it marked dirty in hopes that a later
- * write will work
- */
- if (err) {
- werr = err;
- continue;
- }
+ err = write_one_cache_group(trans, root, path, cache);
+ BUG_ON(err);
+ btrfs_put_block_group(cache);
}
+
btrfs_free_path(path);
- return werr;
+ return 0;
}
int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -2485,6 +2675,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->force_alloc = 0;
*space_info = found;
list_add_rcu(&found->list, &info->space_info);
+ atomic_set(&found->caching_threads, 0);
return 0;
}
@@ -2576,60 +2767,346 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
alloc_target);
}
+static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
+{
+ u64 num_bytes;
+ int level;
+
+ level = BTRFS_MAX_LEVEL - 2;
+ /*
+ * NOTE: these calculations are absolutely the worst possible case.
+ * This assumes that _every_ item we insert will require a new leaf, and
+ * that the tree has grown to its maximum level size.
+ */
+
+ /*
+ * for every item we insert we could insert both an extent item and a
+ * extent ref item. Then for ever item we insert, we will need to cow
+ * both the original leaf, plus the leaf to the left and right of it.
+ *
+ * Unless we are talking about the extent root, then we just want the
+ * number of items * 2, since we just need the extent item plus its ref.
+ */
+ if (root == root->fs_info->extent_root)
+ num_bytes = num_items * 2;
+ else
+ num_bytes = (num_items + (2 * num_items)) * 3;
+
+ /*
+ * num_bytes is total number of leaves we could need times the leaf
+ * size, and then for every leaf we could end up cow'ing 2 nodes per
+ * level, down to the leaf level.
+ */
+ num_bytes = (num_bytes * root->leafsize) +
+ (num_bytes * (level * 2)) * root->nodesize;
+
+ return num_bytes;
+}
+
/*
- * for now this just makes sure we have at least 5% of our metadata space free
- * for use.
+ * Unreserve metadata space for delalloc. If we have less reserved credits than
+ * we have extents, this function does nothing.
*/
-int btrfs_check_metadata_free_space(struct btrfs_root *root)
+int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items)
{
struct btrfs_fs_info *info = root->fs_info;
struct btrfs_space_info *meta_sinfo;
- u64 alloc_target, thresh;
- int committed = 0, ret;
+ u64 num_bytes;
+ u64 alloc_target;
+ bool bug = false;
/* get the space info for where the metadata will live */
alloc_target = btrfs_get_alloc_profile(root, 0);
meta_sinfo = __find_space_info(info, alloc_target);
-again:
+ num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
+ num_items);
+
spin_lock(&meta_sinfo->lock);
- if (!meta_sinfo->full)
- thresh = meta_sinfo->total_bytes * 80;
- else
- thresh = meta_sinfo->total_bytes * 95;
+ if (BTRFS_I(inode)->delalloc_reserved_extents <=
+ BTRFS_I(inode)->delalloc_extents) {
+ spin_unlock(&meta_sinfo->lock);
+ return 0;
+ }
+
+ BTRFS_I(inode)->delalloc_reserved_extents--;
+ BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0);
+
+ if (meta_sinfo->bytes_delalloc < num_bytes) {
+ bug = true;
+ meta_sinfo->bytes_delalloc = 0;
+ } else {
+ meta_sinfo->bytes_delalloc -= num_bytes;
+ }
+ spin_unlock(&meta_sinfo->lock);
+
+ BUG_ON(bug);
+
+ return 0;
+}
+
+static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
+{
+ u64 thresh;
+
+ thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+ meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+ meta_sinfo->bytes_may_use;
+ thresh = meta_sinfo->total_bytes - thresh;
+ thresh *= 80;
do_div(thresh, 100);
+ if (thresh <= meta_sinfo->bytes_delalloc)
+ meta_sinfo->force_delalloc = 1;
+ else
+ meta_sinfo->force_delalloc = 0;
+}
- if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
- meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
- struct btrfs_trans_handle *trans;
- if (!meta_sinfo->full) {
- meta_sinfo->force_alloc = 1;
- spin_unlock(&meta_sinfo->lock);
+static int maybe_allocate_chunk(struct btrfs_root *root,
+ struct btrfs_space_info *info)
+{
+ struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+ struct btrfs_trans_handle *trans;
+ bool wait = false;
+ int ret = 0;
+ u64 min_metadata;
+ u64 free_space;
- trans = btrfs_start_transaction(root, 1);
- if (!trans)
- return -ENOMEM;
+ free_space = btrfs_super_total_bytes(disk_super);
+ /*
+ * we allow the metadata to grow to a max of either 5gb or 5% of the
+ * space in the volume.
+ */
+ min_metadata = min((u64)5 * 1024 * 1024 * 1024,
+ div64_u64(free_space * 5, 100));
+ if (info->total_bytes >= min_metadata) {
+ spin_unlock(&info->lock);
+ return 0;
+ }
- ret = do_chunk_alloc(trans, root->fs_info->extent_root,
- 2 * 1024 * 1024, alloc_target, 0);
- btrfs_end_transaction(trans, root);
+ if (info->full) {
+ spin_unlock(&info->lock);
+ return 0;
+ }
+
+ if (!info->allocating_chunk) {
+ info->force_alloc = 1;
+ info->allocating_chunk = 1;
+ init_waitqueue_head(&info->wait);
+ } else {
+ wait = true;
+ }
+
+ spin_unlock(&info->lock);
+
+ if (wait) {
+ wait_event(info->wait,
+ !info->allocating_chunk);
+ return 1;
+ }
+
+ trans = btrfs_start_transaction(root, 1);
+ if (!trans) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+ 4096 + 2 * 1024 * 1024,
+ info->flags, 0);
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ goto out;
+out:
+ spin_lock(&info->lock);
+ info->allocating_chunk = 0;
+ spin_unlock(&info->lock);
+ wake_up(&info->wait);
+
+ if (ret)
+ return 0;
+ return 1;
+}
+
+/*
+ * Reserve metadata space for delalloc.
+ */
+int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
+ struct inode *inode, int num_items)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *meta_sinfo;
+ u64 num_bytes;
+ u64 used;
+ u64 alloc_target;
+ int flushed = 0;
+ int force_delalloc;
+
+ /* get the space info for where the metadata will live */
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ meta_sinfo = __find_space_info(info, alloc_target);
+
+ num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
+ num_items);
+again:
+ spin_lock(&meta_sinfo->lock);
+
+ force_delalloc = meta_sinfo->force_delalloc;
+
+ if (unlikely(!meta_sinfo->bytes_root))
+ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+
+ if (!flushed)
+ meta_sinfo->bytes_delalloc += num_bytes;
+
+ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+ meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+
+ if (used > meta_sinfo->total_bytes) {
+ flushed++;
+
+ if (flushed == 1) {
+ if (maybe_allocate_chunk(root, meta_sinfo))
+ goto again;
+ flushed++;
+ } else {
+ spin_unlock(&meta_sinfo->lock);
+ }
+
+ if (flushed == 2) {
+ filemap_flush(inode->i_mapping);
+ goto again;
+ } else if (flushed == 3) {
+ btrfs_start_delalloc_inodes(root);
+ btrfs_wait_ordered_extents(root, 0);
goto again;
}
+ spin_lock(&meta_sinfo->lock);
+ meta_sinfo->bytes_delalloc -= num_bytes;
spin_unlock(&meta_sinfo->lock);
+ printk(KERN_ERR "enospc, has %d, reserved %d\n",
+ BTRFS_I(inode)->delalloc_extents,
+ BTRFS_I(inode)->delalloc_reserved_extents);
+ dump_space_info(meta_sinfo, 0, 0);
+ return -ENOSPC;
+ }
- if (!committed) {
- committed = 1;
- trans = btrfs_join_transaction(root, 1);
- if (!trans)
- return -ENOMEM;
- ret = btrfs_commit_transaction(trans, root);
- if (ret)
- return ret;
+ BTRFS_I(inode)->delalloc_reserved_extents++;
+ check_force_delalloc(meta_sinfo);
+ spin_unlock(&meta_sinfo->lock);
+
+ if (!flushed && force_delalloc)
+ filemap_flush(inode->i_mapping);
+
+ return 0;
+}
+
+/*
+ * unreserve num_items number of items worth of metadata space. This needs to
+ * be paired with btrfs_reserve_metadata_space.
+ *
+ * NOTE: if you have the option, run this _AFTER_ you do a
+ * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
+ * oprations which will result in more used metadata, so we want to make sure we
+ * can do that without issue.
+ */
+int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *meta_sinfo;
+ u64 num_bytes;
+ u64 alloc_target;
+ bool bug = false;
+
+ /* get the space info for where the metadata will live */
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ meta_sinfo = __find_space_info(info, alloc_target);
+
+ num_bytes = calculate_bytes_needed(root, num_items);
+
+ spin_lock(&meta_sinfo->lock);
+ if (meta_sinfo->bytes_may_use < num_bytes) {
+ bug = true;
+ meta_sinfo->bytes_may_use = 0;
+ } else {
+ meta_sinfo->bytes_may_use -= num_bytes;
+ }
+ spin_unlock(&meta_sinfo->lock);
+
+ BUG_ON(bug);
+
+ return 0;
+}
+
+/*
+ * Reserve some metadata space for use. We'll calculate the worste case number
+ * of bytes that would be needed to modify num_items number of items. If we
+ * have space, fantastic, if not, you get -ENOSPC. Please call
+ * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
+ * items you reserved, since whatever metadata you needed should have already
+ * been allocated.
+ *
+ * This will commit the transaction to make more space if we don't have enough
+ * metadata space. THe only time we don't do this is if we're reserving space
+ * inside of a transaction, then we will just return -ENOSPC and it is the
+ * callers responsibility to handle it properly.
+ */
+int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
+{
+ struct btrfs_fs_info *info = root->fs_info;
+ struct btrfs_space_info *meta_sinfo;
+ u64 num_bytes;
+ u64 used;
+ u64 alloc_target;
+ int retries = 0;
+
+ /* get the space info for where the metadata will live */
+ alloc_target = btrfs_get_alloc_profile(root, 0);
+ meta_sinfo = __find_space_info(info, alloc_target);
+
+ num_bytes = calculate_bytes_needed(root, num_items);
+again:
+ spin_lock(&meta_sinfo->lock);
+
+ if (unlikely(!meta_sinfo->bytes_root))
+ meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
+
+ if (!retries)
+ meta_sinfo->bytes_may_use += num_bytes;
+
+ used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+ meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+ meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+ meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
+
+ if (used > meta_sinfo->total_bytes) {
+ retries++;
+ if (retries == 1) {
+ if (maybe_allocate_chunk(root, meta_sinfo))
+ goto again;
+ retries++;
+ } else {
+ spin_unlock(&meta_sinfo->lock);
+ }
+
+ if (retries == 2) {
+ btrfs_start_delalloc_inodes(root);
+ btrfs_wait_ordered_extents(root, 0);
goto again;
}
+ spin_lock(&meta_sinfo->lock);
+ meta_sinfo->bytes_may_use -= num_bytes;
+ spin_unlock(&meta_sinfo->lock);
+
+ dump_space_info(meta_sinfo, 0, 0);
return -ENOSPC;
}
+
+ check_force_delalloc(meta_sinfo);
spin_unlock(&meta_sinfo->lock);
return 0;
@@ -2649,13 +3126,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
data_sinfo = BTRFS_I(inode)->space_info;
+ if (!data_sinfo)
+ goto alloc;
+
again:
/* make sure we have enough space to handle the data first */
spin_lock(&data_sinfo->lock);
if (data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
- data_sinfo->bytes_may_use < bytes) {
+ data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
struct btrfs_trans_handle *trans;
/*
@@ -2667,7 +3147,7 @@ again:
data_sinfo->force_alloc = 1;
spin_unlock(&data_sinfo->lock);
-
+alloc:
alloc_target = btrfs_get_alloc_profile(root, 1);
trans = btrfs_start_transaction(root, 1);
if (!trans)
@@ -2679,12 +3159,17 @@ again:
btrfs_end_transaction(trans, root);
if (ret)
return ret;
+
+ if (!data_sinfo) {
+ btrfs_set_inode_space_info(root, inode);
+ data_sinfo = BTRFS_I(inode)->space_info;
+ }
goto again;
}
spin_unlock(&data_sinfo->lock);
/* commit the current transaction and try again */
- if (!committed) {
+ if (!committed && !root->fs_info->open_ioctl_trans) {
committed = 1;
trans = btrfs_join_transaction(root, 1);
if (!trans)
@@ -2697,7 +3182,7 @@ again:
printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
", %llu bytes_used, %llu bytes_reserved, "
- "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
+ "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
"%llu total\n", (unsigned long long)bytes,
(unsigned long long)data_sinfo->bytes_delalloc,
(unsigned long long)data_sinfo->bytes_used,
@@ -2712,7 +3197,7 @@ again:
BTRFS_I(inode)->reserved_bytes += bytes;
spin_unlock(&data_sinfo->lock);
- return btrfs_check_metadata_free_space(root);
+ return 0;
}
/*
@@ -2811,17 +3296,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
BUG_ON(!space_info);
spin_lock(&space_info->lock);
- if (space_info->force_alloc) {
+ if (space_info->force_alloc)
force = 1;
- space_info->force_alloc = 0;
- }
if (space_info->full) {
spin_unlock(&space_info->lock);
goto out;
}
thresh = space_info->total_bytes - space_info->bytes_readonly;
- thresh = div_factor(thresh, 6);
+ thresh = div_factor(thresh, 8);
if (!force &&
(space_info->bytes_used + space_info->bytes_pinned +
space_info->bytes_reserved + alloc_bytes) < thresh) {
@@ -2835,7 +3318,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
* we keep a reasonable number of metadata chunks allocated in the
* FS as well.
*/
- if (flags & BTRFS_BLOCK_GROUP_DATA) {
+ if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
fs_info->data_chunk_allocations++;
if (!(fs_info->data_chunk_allocations %
fs_info->metadata_ratio))
@@ -2843,8 +3326,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
}
ret = btrfs_alloc_chunk(trans, extent_root, flags);
+ spin_lock(&space_info->lock);
if (ret)
space_info->full = 1;
+ space_info->force_alloc = 0;
+ spin_unlock(&space_info->lock);
out:
mutex_unlock(&extent_root->fs_info->chunk_mutex);
return ret;
@@ -2893,10 +3379,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
num_bytes = min(total, cache->key.offset - byte_in_group);
if (alloc) {
old_val += num_bytes;
+ btrfs_set_block_group_used(&cache->item, old_val);
+ cache->reserved -= num_bytes;
cache->space_info->bytes_used += num_bytes;
+ cache->space_info->bytes_reserved -= num_bytes;
if (cache->ro)
cache->space_info->bytes_readonly -= num_bytes;
- btrfs_set_block_group_used(&cache->item, old_val);
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
} else {
@@ -2941,110 +3429,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
return bytenr;
}
-int btrfs_update_pinned_extents(struct btrfs_root *root,
- u64 bytenr, u64 num, int pin)
+/*
+ * this function must be called within transaction
+ */
+int btrfs_pin_extent(struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, int reserved)
{
- u64 len;
- struct btrfs_block_group_cache *cache;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_group_cache *cache;
+
+ cache = btrfs_lookup_block_group(fs_info, bytenr);
+ BUG_ON(!cache);
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->pinned += num_bytes;
+ cache->space_info->bytes_pinned += num_bytes;
+ if (reserved) {
+ cache->reserved -= num_bytes;
+ cache->space_info->bytes_reserved -= num_bytes;
+ }
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+
+ btrfs_put_block_group(cache);
+
+ set_extent_dirty(fs_info->pinned_extents,
+ bytenr, bytenr + num_bytes - 1, GFP_NOFS);
+ return 0;
+}
- if (pin) {
- set_extent_dirty(&fs_info->pinned_extents,
- bytenr, bytenr + num - 1, GFP_NOFS);
+static int update_reserved_extents(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int reserve)
+{
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ if (reserve) {
+ cache->reserved += num_bytes;
+ cache->space_info->bytes_reserved += num_bytes;
} else {
- clear_extent_dirty(&fs_info->pinned_extents,
- bytenr, bytenr + num - 1, GFP_NOFS);
- }
-
- while (num > 0) {
- cache = btrfs_lookup_block_group(fs_info, bytenr);
- BUG_ON(!cache);
- len = min(num, cache->key.offset -
- (bytenr - cache->key.objectid));
- if (pin) {
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- cache->pinned += len;
- cache->space_info->bytes_pinned += len;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
- fs_info->total_pinned += len;
- } else {
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- cache->pinned -= len;
- cache->space_info->bytes_pinned -= len;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
- fs_info->total_pinned -= len;
- if (cache->cached)
- btrfs_add_free_space(cache, bytenr, len);
- }
- btrfs_put_block_group(cache);
- bytenr += len;
- num -= len;
+ cache->reserved -= num_bytes;
+ cache->space_info->bytes_reserved -= num_bytes;
}
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
return 0;
}
-static int update_reserved_extents(struct btrfs_root *root,
- u64 bytenr, u64 num, int reserve)
+int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
{
- u64 len;
- struct btrfs_block_group_cache *cache;
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_caching_control *next;
+ struct btrfs_caching_control *caching_ctl;
+ struct btrfs_block_group_cache *cache;
- while (num > 0) {
- cache = btrfs_lookup_block_group(fs_info, bytenr);
- BUG_ON(!cache);
- len = min(num, cache->key.offset -
- (bytenr - cache->key.objectid));
+ down_write(&fs_info->extent_commit_sem);
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- if (reserve) {
- cache->reserved += len;
- cache->space_info->bytes_reserved += len;
+ list_for_each_entry_safe(caching_ctl, next,
+ &fs_info->caching_block_groups, list) {
+ cache = caching_ctl->block_group;
+ if (block_group_cache_done(cache)) {
+ cache->last_byte_to_unpin = (u64)-1;
+ list_del_init(&caching_ctl->list);
+ put_caching_control(caching_ctl);
} else {
- cache->reserved -= len;
- cache->space_info->bytes_reserved -= len;
+ cache->last_byte_to_unpin = caching_ctl->progress;
}
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
- btrfs_put_block_group(cache);
- bytenr += len;
- num -= len;
}
+
+ if (fs_info->pinned_extents == &fs_info->freed_extents[0])
+ fs_info->pinned_extents = &fs_info->freed_extents[1];
+ else
+ fs_info->pinned_extents = &fs_info->freed_extents[0];
+
+ up_write(&fs_info->extent_commit_sem);
return 0;
}
-int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
+static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
- u64 last = 0;
- u64 start;
- u64 end;
- struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
- int ret;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct btrfs_block_group_cache *cache = NULL;
+ u64 len;
- while (1) {
- ret = find_first_extent_bit(pinned_extents, last,
- &start, &end, EXTENT_DIRTY);
- if (ret)
- break;
- set_extent_dirty(copy, start, end, GFP_NOFS);
- last = end + 1;
+ while (start <= end) {
+ if (!cache ||
+ start >= cache->key.objectid + cache->key.offset) {
+ if (cache)
+ btrfs_put_block_group(cache);
+ cache = btrfs_lookup_block_group(fs_info, start);
+ BUG_ON(!cache);
+ }
+
+ len = cache->key.objectid + cache->key.offset - start;
+ len = min(len, end + 1 - start);
+
+ if (start < cache->last_byte_to_unpin) {
+ len = min(len, cache->last_byte_to_unpin - start);
+ btrfs_add_free_space(cache, start, len);
+ }
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ cache->pinned -= len;
+ cache->space_info->bytes_pinned -= len;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+
+ start += len;
}
+
+ if (cache)
+ btrfs_put_block_group(cache);
return 0;
}
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_io_tree *unpin)
+ struct btrfs_root *root)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct extent_io_tree *unpin;
u64 start;
u64 end;
int ret;
+ if (fs_info->pinned_extents == &fs_info->freed_extents[0])
+ unpin = &fs_info->freed_extents[1];
+ else
+ unpin = &fs_info->freed_extents[0];
+
while (1) {
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY);
@@ -3053,19 +3567,19 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
ret = btrfs_discard_extent(root, start, end + 1 - start);
- /* unlocks the pinned mutex */
- btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
clear_extent_dirty(unpin, start, end, GFP_NOFS);
-
+ unpin_extent_range(root, start, end);
cond_resched();
}
+
return ret;
}
static int pin_down_bytes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- u64 bytenr, u64 num_bytes, int is_data,
+ u64 bytenr, u64 num_bytes,
+ int is_data, int reserved,
struct extent_buffer **must_clean)
{
int err = 0;
@@ -3097,15 +3611,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
}
free_extent_buffer(buf);
pinit:
- btrfs_set_path_blocking(path);
+ if (path)
+ btrfs_set_path_blocking(path);
/* unlocks the pinned mutex */
- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+ btrfs_pin_extent(root, bytenr, num_bytes, reserved);
BUG_ON(err < 0);
return 0;
}
-
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -3279,7 +3793,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
ret = pin_down_bytes(trans, root, path, bytenr,
- num_bytes, is_data, &must_clean);
+ num_bytes, is_data, 0, &must_clean);
if (ret > 0)
mark_free = 1;
BUG_ON(ret < 0);
@@ -3410,8 +3924,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
/* unlocks the pinned mutex */
- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
- update_reserved_extents(root, bytenr, num_bytes, 0);
+ btrfs_pin_extent(root, bytenr, num_bytes, 1);
ret = 0;
} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
@@ -3437,6 +3950,59 @@ static u64 stripe_align(struct btrfs_root *root, u64 val)
}
/*
+ * when we wait for progress in the block group caching, its because
+ * our allocation attempt failed at least once. So, we must sleep
+ * and let some progress happen before we try again.
+ *
+ * This function will sleep at least once waiting for new free space to
+ * show up, and then it will check the block group free space numbers
+ * for our min num_bytes. Another option is to have it go ahead
+ * and look in the rbtree for a free extent of a given size, but this
+ * is a good start.
+ */
+static noinline int
+wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
+ u64 num_bytes)
+{
+ struct btrfs_caching_control *caching_ctl;
+ DEFINE_WAIT(wait);
+
+ caching_ctl = get_caching_control(cache);
+ if (!caching_ctl)
+ return 0;
+
+ wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
+ (cache->free_space >= num_bytes));
+
+ put_caching_control(caching_ctl);
+ return 0;
+}
+
+static noinline int
+wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
+{
+ struct btrfs_caching_control *caching_ctl;
+ DEFINE_WAIT(wait);
+
+ caching_ctl = get_caching_control(cache);
+ if (!caching_ctl)
+ return 0;
+
+ wait_event(caching_ctl->wait, block_group_cache_done(cache));
+
+ put_caching_control(caching_ctl);
+ return 0;
+}
+
+enum btrfs_loop_type {
+ LOOP_CACHED_ONLY = 0,
+ LOOP_CACHING_NOWAIT = 1,
+ LOOP_CACHING_WAIT = 2,
+ LOOP_ALLOC_CHUNK = 3,
+ LOOP_NO_EMPTY_SIZE = 4,
+};
+
+/*
* walks the btree of allocated extents and find a hole of a given size.
* The key ins is changed to record the hole:
* ins->objectid == block start
@@ -3461,6 +4027,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_space_info *space_info;
int last_ptr_loop = 0;
int loop = 0;
+ bool found_uncached_bg = false;
+ bool failed_cluster_refill = false;
WARN_ON(num_bytes < root->sectorsize);
btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
@@ -3492,15 +4060,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
search_start = max(search_start, first_logical_byte(root, 0));
search_start = max(search_start, hint_byte);
- if (!last_ptr) {
+ if (!last_ptr)
empty_cluster = 0;
- loop = 1;
- }
if (search_start == hint_byte) {
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
- if (block_group && block_group_bits(block_group, data)) {
+ /*
+ * we don't want to use the block group if it doesn't match our
+ * allocation bits, or if its not cached.
+ */
+ if (block_group && block_group_bits(block_group, data) &&
+ block_group_cache_done(block_group)) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
block_group->ro) {
@@ -3523,25 +4094,48 @@ search:
down_read(&space_info->groups_sem);
list_for_each_entry(block_group, &space_info->block_groups, list) {
u64 offset;
+ int cached;
atomic_inc(&block_group->count);
search_start = block_group->key.objectid;
have_block_group:
- if (unlikely(!block_group->cached)) {
- mutex_lock(&block_group->cache_mutex);
- ret = cache_block_group(root, block_group);
- mutex_unlock(&block_group->cache_mutex);
- if (ret) {
- btrfs_put_block_group(block_group);
- break;
+ if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
+ /*
+ * we want to start caching kthreads, but not too many
+ * right off the bat so we don't overwhelm the system,
+ * so only start them if there are less than 2 and we're
+ * in the initial allocation phase.
+ */
+ if (loop > LOOP_CACHING_NOWAIT ||
+ atomic_read(&space_info->caching_threads) < 2) {
+ ret = cache_block_group(block_group);
+ BUG_ON(ret);
}
}
+ cached = block_group_cache_done(block_group);
+ if (unlikely(!cached)) {
+ found_uncached_bg = true;
+
+ /* if we only want cached bgs, loop */
+ if (loop == LOOP_CACHED_ONLY)
+ goto loop;
+ }
+
if (unlikely(block_group->ro))
goto loop;
- if (last_ptr) {
+ /*
+ * Ok we want to try and use the cluster allocator, so lets look
+ * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
+ * have tried the cluster allocator plenty of times at this
+ * point and not have found anything, so we are likely way too
+ * fragmented for the clustering stuff to find anything, so lets
+ * just skip it and let the allocator find whatever block it can
+ * find
+ */
+ if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
/*
* the refill lock keeps out other
* people trying to start a new cluster
@@ -3616,29 +4210,40 @@ refill_cluster:
spin_unlock(&last_ptr->refill_lock);
goto checks;
}
+ } else if (!cached && loop > LOOP_CACHING_NOWAIT
+ && !failed_cluster_refill) {
+ spin_unlock(&last_ptr->refill_lock);
+
+ failed_cluster_refill = true;
+ wait_block_group_cache_progress(block_group,
+ num_bytes + empty_cluster + empty_size);
+ goto have_block_group;
}
+
/*
* at this point we either didn't find a cluster
* or we weren't able to allocate a block from our
* cluster. Free the cluster we've been trying
* to use, and go to the next block group
*/
- if (loop < 2) {
- btrfs_return_cluster_to_free_space(NULL,
- last_ptr);
- spin_unlock(&last_ptr->refill_lock);
- goto loop;
- }
+ btrfs_return_cluster_to_free_space(NULL, last_ptr);
spin_unlock(&last_ptr->refill_lock);
+ goto loop;
}
offset = btrfs_find_space_for_alloc(block_group, search_start,
num_bytes, empty_size);
- if (!offset)
+ if (!offset && (cached || (!cached &&
+ loop == LOOP_CACHING_NOWAIT))) {
goto loop;
+ } else if (!offset && (!cached &&
+ loop > LOOP_CACHING_NOWAIT)) {
+ wait_block_group_cache_progress(block_group,
+ num_bytes + empty_size);
+ goto have_block_group;
+ }
checks:
search_start = stripe_align(root, offset);
-
/* move on to the next group */
if (search_start + num_bytes >= search_end) {
btrfs_add_free_space(block_group, offset, num_bytes);
@@ -3677,20 +4282,36 @@ checks:
search_start - offset);
BUG_ON(offset > search_start);
+ update_reserved_extents(block_group, num_bytes, 1);
+
/* we are all good, lets return */
break;
loop:
+ failed_cluster_refill = false;
btrfs_put_block_group(block_group);
}
up_read(&space_info->groups_sem);
- /* loop == 0, try to find a clustered alloc in every block group
- * loop == 1, try again after forcing a chunk allocation
- * loop == 2, set empty_size and empty_cluster to 0 and try again
+ /* LOOP_CACHED_ONLY, only search fully cached block groups
+ * LOOP_CACHING_NOWAIT, search partially cached block groups, but
+ * dont wait foR them to finish caching
+ * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
+ * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
+ * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
+ * again
*/
- if (!ins->objectid && loop < 3 &&
- (empty_size || empty_cluster || allowed_chunk_alloc)) {
- if (loop >= 2) {
+ if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
+ (found_uncached_bg || empty_size || empty_cluster ||
+ allowed_chunk_alloc)) {
+ if (found_uncached_bg) {
+ found_uncached_bg = false;
+ if (loop < LOOP_CACHING_WAIT) {
+ loop++;
+ goto search;
+ }
+ }
+
+ if (loop == LOOP_ALLOC_CHUNK) {
empty_size = 0;
empty_cluster = 0;
}
@@ -3703,7 +4324,7 @@ loop:
space_info->force_alloc = 1;
}
- if (loop < 3) {
+ if (loop < LOOP_NO_EMPTY_SIZE) {
loop++;
goto search;
}
@@ -3724,21 +4345,32 @@ loop:
return ret;
}
-static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
+static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
+ int dump_block_groups)
{
struct btrfs_block_group_cache *cache;
+ spin_lock(&info->lock);
printk(KERN_INFO "space_info has %llu free, is %sfull\n",
(unsigned long long)(info->total_bytes - info->bytes_used -
- info->bytes_pinned - info->bytes_reserved),
+ info->bytes_pinned - info->bytes_reserved -
+ info->bytes_super),
(info->full) ? "" : "not ");
printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
- " may_use=%llu, used=%llu\n",
+ " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
+ "\n",
(unsigned long long)info->total_bytes,
(unsigned long long)info->bytes_pinned,
(unsigned long long)info->bytes_delalloc,
(unsigned long long)info->bytes_may_use,
- (unsigned long long)info->bytes_used);
+ (unsigned long long)info->bytes_used,
+ (unsigned long long)info->bytes_root,
+ (unsigned long long)info->bytes_super,
+ (unsigned long long)info->bytes_reserved);
+ spin_unlock(&info->lock);
+
+ if (!dump_block_groups)
+ return;
down_read(&info->groups_sem);
list_for_each_entry(cache, &info->block_groups, list) {
@@ -3756,12 +4388,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
up_read(&info->groups_sem);
}
-static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 num_bytes, u64 min_alloc_size,
- u64 empty_size, u64 hint_byte,
- u64 search_end, struct btrfs_key *ins,
- u64 data)
+int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 num_bytes, u64 min_alloc_size,
+ u64 empty_size, u64 hint_byte,
+ u64 search_end, struct btrfs_key *ins,
+ u64 data)
{
int ret;
u64 search_start = 0;
@@ -3799,15 +4431,14 @@ again:
num_bytes, data, 1);
goto again;
}
- if (ret) {
+ if (ret == -ENOSPC) {
struct btrfs_space_info *sinfo;
sinfo = __find_space_info(root->fs_info, data);
printk(KERN_ERR "btrfs allocation failed flags %llu, "
"wanted %llu\n", (unsigned long long)data,
(unsigned long long)num_bytes);
- dump_space_info(sinfo, num_bytes);
- BUG();
+ dump_space_info(sinfo, num_bytes, 1);
}
return ret;
@@ -3828,27 +4459,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
ret = btrfs_discard_extent(root, start, len);
btrfs_add_free_space(cache, start, len);
+ update_reserved_extents(cache, len, 0);
btrfs_put_block_group(cache);
- update_reserved_extents(root, start, len, 0);
return ret;
}
-int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 num_bytes, u64 min_alloc_size,
- u64 empty_size, u64 hint_byte,
- u64 search_end, struct btrfs_key *ins,
- u64 data)
-{
- int ret;
- ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
- empty_size, hint_byte, search_end, ins,
- data);
- update_reserved_extents(root, ins->objectid, ins->offset, 1);
- return ret;
-}
-
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
@@ -4005,15 +4621,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
{
int ret;
struct btrfs_block_group_cache *block_group;
+ struct btrfs_caching_control *caching_ctl;
+ u64 start = ins->objectid;
+ u64 num_bytes = ins->offset;
block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
- mutex_lock(&block_group->cache_mutex);
- cache_block_group(root, block_group);
- mutex_unlock(&block_group->cache_mutex);
+ cache_block_group(block_group);
+ caching_ctl = get_caching_control(block_group);
- ret = btrfs_remove_free_space(block_group, ins->objectid,
- ins->offset);
- BUG_ON(ret);
+ if (!caching_ctl) {
+ BUG_ON(!block_group_cache_done(block_group));
+ ret = btrfs_remove_free_space(block_group, start, num_bytes);
+ BUG_ON(ret);
+ } else {
+ mutex_lock(&caching_ctl->mutex);
+
+ if (start >= caching_ctl->progress) {
+ ret = add_excluded_extent(root, start, num_bytes);
+ BUG_ON(ret);
+ } else if (start + num_bytes <= caching_ctl->progress) {
+ ret = btrfs_remove_free_space(block_group,
+ start, num_bytes);
+ BUG_ON(ret);
+ } else {
+ num_bytes = caching_ctl->progress - start;
+ ret = btrfs_remove_free_space(block_group,
+ start, num_bytes);
+ BUG_ON(ret);
+
+ start = caching_ctl->progress;
+ num_bytes = ins->objectid + ins->offset -
+ caching_ctl->progress;
+ ret = add_excluded_extent(root, start, num_bytes);
+ BUG_ON(ret);
+ }
+
+ mutex_unlock(&caching_ctl->mutex);
+ put_caching_control(caching_ctl);
+ }
+
+ update_reserved_extents(block_group, ins->offset, 1);
btrfs_put_block_group(block_group);
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
@@ -4037,10 +4684,11 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
int ret;
u64 flags = 0;
- ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
- empty_size, hint_byte, search_end,
- ins, 0);
- BUG_ON(ret);
+ ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
+ empty_size, hint_byte, search_end,
+ ins, 0);
+ if (ret)
+ return ret;
if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
if (parent == 0)
@@ -4049,7 +4697,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
} else
BUG_ON(parent > 0);
- update_reserved_extents(root, ins->objectid, ins->offset, 1);
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
struct btrfs_delayed_extent_op *extent_op;
extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
@@ -4128,687 +4775,642 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
return buf;
}
-int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct extent_buffer *leaf)
+struct walk_control {
+ u64 refs[BTRFS_MAX_LEVEL];
+ u64 flags[BTRFS_MAX_LEVEL];
+ struct btrfs_key update_progress;
+ int stage;
+ int level;
+ int shared_level;
+ int update_ref;
+ int keep_locks;
+ int reada_slot;
+ int reada_count;
+};
+
+#define DROP_REFERENCE 1
+#define UPDATE_BACKREF 2
+
+static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct walk_control *wc,
+ struct btrfs_path *path)
{
- u64 disk_bytenr;
- u64 num_bytes;
- struct btrfs_key key;
- struct btrfs_file_extent_item *fi;
+ u64 bytenr;
+ u64 generation;
+ u64 refs;
+ u64 last = 0;
u32 nritems;
- int i;
+ u32 blocksize;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
int ret;
+ int slot;
+ int nread = 0;
- BUG_ON(!btrfs_is_leaf(leaf));
- nritems = btrfs_header_nritems(leaf);
-
- for (i = 0; i < nritems; i++) {
- cond_resched();
- btrfs_item_key_to_cpu(leaf, &key, i);
+ if (path->slots[wc->level] < wc->reada_slot) {
+ wc->reada_count = wc->reada_count * 2 / 3;
+ wc->reada_count = max(wc->reada_count, 2);
+ } else {
+ wc->reada_count = wc->reada_count * 3 / 2;
+ wc->reada_count = min_t(int, wc->reada_count,
+ BTRFS_NODEPTRS_PER_BLOCK(root));
+ }
- /* only extents have references, skip everything else */
- if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
- continue;
+ eb = path->nodes[wc->level];
+ nritems = btrfs_header_nritems(eb);
+ blocksize = btrfs_level_size(root, wc->level - 1);
- fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+ for (slot = path->slots[wc->level]; slot < nritems; slot++) {
+ if (nread >= wc->reada_count)
+ break;
- /* inline extents live in the btree, they don't have refs */
- if (btrfs_file_extent_type(leaf, fi) ==
- BTRFS_FILE_EXTENT_INLINE)
- continue;
+ cond_resched();
+ bytenr = btrfs_node_blockptr(eb, slot);
+ generation = btrfs_node_ptr_generation(eb, slot);
- disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (slot == path->slots[wc->level])
+ goto reada;
- /* holes don't have refs */
- if (disk_bytenr == 0)
+ if (wc->stage == UPDATE_BACKREF &&
+ generation <= root->root_key.offset)
continue;
- num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
- ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
- leaf->start, 0, key.objectid, 0);
- BUG_ON(ret);
+ if (wc->stage == DROP_REFERENCE) {
+ ret = btrfs_lookup_extent_info(trans, root,
+ bytenr, blocksize,
+ &refs, NULL);
+ BUG_ON(ret);
+ BUG_ON(refs == 0);
+ if (refs == 1)
+ goto reada;
+
+ if (!wc->update_ref ||
+ generation <= root->root_key.offset)
+ continue;
+ btrfs_node_key_to_cpu(eb, &key, slot);
+ ret = btrfs_comp_cpu_keys(&key,
+ &wc->update_progress);
+ if (ret < 0)
+ continue;
+ }
+reada:
+ ret = readahead_tree_block(root, bytenr, blocksize,
+ generation);
+ if (ret)
+ break;
+ last = bytenr + blocksize;
+ nread++;
}
- return 0;
+ wc->reada_slot = slot;
}
-#if 0
-
-static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_leaf_ref *ref)
+/*
+ * hepler to process tree block while walking down the tree.
+ *
+ * when wc->stage == UPDATE_BACKREF, this function updates
+ * back refs for pointers in the block.
+ *
+ * NOTE: return value 1 means we should stop walking down.
+ */
+static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc)
{
- int i;
+ int level = wc->level;
+ struct extent_buffer *eb = path->nodes[level];
+ u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
int ret;
- struct btrfs_extent_info *info;
- struct refsort *sorted;
-
- if (ref->nritems == 0)
- return 0;
- sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
- for (i = 0; i < ref->nritems; i++) {
- sorted[i].bytenr = ref->extents[i].bytenr;
- sorted[i].slot = i;
- }
- sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
+ if (wc->stage == UPDATE_BACKREF &&
+ btrfs_header_owner(eb) != root->root_key.objectid)
+ return 1;
/*
- * the items in the ref were sorted when the ref was inserted
- * into the ref cache, so this is already in order
+ * when reference count of tree block is 1, it won't increase
+ * again. once full backref flag is set, we never clear it.
*/
- for (i = 0; i < ref->nritems; i++) {
- info = ref->extents + sorted[i].slot;
- ret = btrfs_free_extent(trans, root, info->bytenr,
- info->num_bytes, ref->bytenr,
- ref->owner, ref->generation,
- info->objectid, 0);
-
- atomic_inc(&root->fs_info->throttle_gen);
- wake_up(&root->fs_info->transaction_throttle);
- cond_resched();
-
+ if ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
+ (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag))) {
+ BUG_ON(!path->locks[level]);
+ ret = btrfs_lookup_extent_info(trans, root,
+ eb->start, eb->len,
+ &wc->refs[level],
+ &wc->flags[level]);
BUG_ON(ret);
- info++;
+ BUG_ON(wc->refs[level] == 0);
}
- kfree(sorted);
- return 0;
-}
-
-
-static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 start,
- u64 len, u32 *refs)
-{
- int ret;
-
- ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
- BUG_ON(ret);
-
-#if 0 /* some debugging code in case we see problems here */
- /* if the refs count is one, it won't get increased again. But
- * if the ref count is > 1, someone may be decreasing it at
- * the same time we are.
- */
- if (*refs != 1) {
- struct extent_buffer *eb = NULL;
- eb = btrfs_find_create_tree_block(root, start, len);
- if (eb)
- btrfs_tree_lock(eb);
-
- mutex_lock(&root->fs_info->alloc_mutex);
- ret = lookup_extent_ref(NULL, root, start, len, refs);
- BUG_ON(ret);
- mutex_unlock(&root->fs_info->alloc_mutex);
+ if (wc->stage == DROP_REFERENCE) {
+ if (wc->refs[level] > 1)
+ return 1;
- if (eb) {
+ if (path->locks[level] && !wc->keep_locks) {
btrfs_tree_unlock(eb);
- free_extent_buffer(eb);
- }
- if (*refs == 1) {
- printk(KERN_ERR "btrfs block %llu went down to one "
- "during drop_snap\n", (unsigned long long)start);
+ path->locks[level] = 0;
}
+ return 0;
+ }
+ /* wc->stage == UPDATE_BACKREF */
+ if (!(wc->flags[level] & flag)) {
+ BUG_ON(!path->locks[level]);
+ ret = btrfs_inc_ref(trans, root, eb, 1);
+ BUG_ON(ret);
+ ret = btrfs_dec_ref(trans, root, eb, 0);
+ BUG_ON(ret);
+ ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
+ eb->len, flag, 0);
+ BUG_ON(ret);
+ wc->flags[level] |= flag;
}
-#endif
- cond_resched();
- return ret;
+ /*
+ * the block is shared by multiple trees, so it's not good to
+ * keep the tree lock
+ */
+ if (path->locks[level] && level > 0) {
+ btrfs_tree_unlock(eb);
+ path->locks[level] = 0;
+ }
+ return 0;
}
-
/*
- * this is used while deleting old snapshots, and it drops the refs
- * on a whole subtree starting from a level 1 node.
+ * hepler to process tree block pointer.
*
- * The idea is to sort all the leaf pointers, and then drop the
- * ref on all the leaves in order. Most of the time the leaves
- * will have ref cache entries, so no leaf IOs will be required to
- * find the extents they have references on.
+ * when wc->stage == DROP_REFERENCE, this function checks
+ * reference count of the block pointed to. if the block
+ * is shared and we need update back refs for the subtree
+ * rooted at the block, this function changes wc->stage to
+ * UPDATE_BACKREF. if the block is shared and there is no
+ * need to update back, this function drops the reference
+ * to the block.
*
- * For each leaf, any references it has are also dropped in order
- *
- * This ends up dropping the references in something close to optimal
- * order for reading and modifying the extent allocation tree.
+ * NOTE: return value 1 means we should stop walking down.
*/
-static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path)
+static noinline int do_walk_down(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc)
{
u64 bytenr;
- u64 root_owner;
- u64 root_gen;
- struct extent_buffer *eb = path->nodes[1];
- struct extent_buffer *leaf;
- struct btrfs_leaf_ref *ref;
- struct refsort *sorted = NULL;
- int nritems = btrfs_header_nritems(eb);
- int ret;
- int i;
- int refi = 0;
- int slot = path->slots[1];
- u32 blocksize = btrfs_level_size(root, 0);
- u32 refs;
-
- if (nritems == 0)
- goto out;
-
- root_owner = btrfs_header_owner(eb);
- root_gen = btrfs_header_generation(eb);
- sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
-
- /*
- * step one, sort all the leaf pointers so we don't scribble
- * randomly into the extent allocation tree
- */
- for (i = slot; i < nritems; i++) {
- sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
- sorted[refi].slot = i;
- refi++;
- }
+ u64 generation;
+ u64 parent;
+ u32 blocksize;
+ struct btrfs_key key;
+ struct extent_buffer *next;
+ int level = wc->level;
+ int reada = 0;
+ int ret = 0;
+ generation = btrfs_node_ptr_generation(path->nodes[level],
+ path->slots[level]);
/*
- * nritems won't be zero, but if we're picking up drop_snapshot
- * after a crash, slot might be > 0, so double check things
- * just in case.
+ * if the lower level block was created before the snapshot
+ * was created, we know there is no need to update back refs
+ * for the subtree
*/
- if (refi == 0)
- goto out;
-
- sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+ if (wc->stage == UPDATE_BACKREF &&
+ generation <= root->root_key.offset)
+ return 1;
- /*
- * the first loop frees everything the leaves point to
- */
- for (i = 0; i < refi; i++) {
- u64 ptr_gen;
+ bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
+ blocksize = btrfs_level_size(root, level - 1);
- bytenr = sorted[i].bytenr;
+ next = btrfs_find_tree_block(root, bytenr, blocksize);
+ if (!next) {
+ next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ reada = 1;
+ }
+ btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
- /*
- * check the reference count on this leaf. If it is > 1
- * we just decrement it below and don't update any
- * of the refs the leaf points to.
- */
- ret = drop_snap_lookup_refcount(trans, root, bytenr,
- blocksize, &refs);
+ if (wc->stage == DROP_REFERENCE) {
+ ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+ &wc->refs[level - 1],
+ &wc->flags[level - 1]);
BUG_ON(ret);
- if (refs != 1)
- continue;
+ BUG_ON(wc->refs[level - 1] == 0);
- ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
+ if (wc->refs[level - 1] > 1) {
+ if (!wc->update_ref ||
+ generation <= root->root_key.offset)
+ goto skip;
- /*
- * the leaf only had one reference, which means the
- * only thing pointing to this leaf is the snapshot
- * we're deleting. It isn't possible for the reference
- * count to increase again later
- *
- * The reference cache is checked for the leaf,
- * and if found we'll be able to drop any refs held by
- * the leaf without needing to read it in.
- */
- ref = btrfs_lookup_leaf_ref(root, bytenr);
- if (ref && ref->generation != ptr_gen) {
- btrfs_free_leaf_ref(root, ref);
- ref = NULL;
- }
- if (ref) {
- ret = cache_drop_leaf_ref(trans, root, ref);
- BUG_ON(ret);
- btrfs_remove_leaf_ref(root, ref);
- btrfs_free_leaf_ref(root, ref);
- } else {
- /*
- * the leaf wasn't in the reference cache, so
- * we have to read it.
- */
- leaf = read_tree_block(root, bytenr, blocksize,
- ptr_gen);
- ret = btrfs_drop_leaf_ref(trans, root, leaf);
- BUG_ON(ret);
- free_extent_buffer(leaf);
+ btrfs_node_key_to_cpu(path->nodes[level], &key,
+ path->slots[level]);
+ ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
+ if (ret < 0)
+ goto skip;
+
+ wc->stage = UPDATE_BACKREF;
+ wc->shared_level = level - 1;
}
- atomic_inc(&root->fs_info->throttle_gen);
- wake_up(&root->fs_info->transaction_throttle);
- cond_resched();
}
- /*
- * run through the loop again to free the refs on the leaves.
- * This is faster than doing it in the loop above because
- * the leaves are likely to be clustered together. We end up
- * working in nice chunks on the extent allocation tree.
- */
- for (i = 0; i < refi; i++) {
- bytenr = sorted[i].bytenr;
- ret = btrfs_free_extent(trans, root, bytenr,
- blocksize, eb->start,
- root_owner, root_gen, 0, 1);
- BUG_ON(ret);
+ if (!btrfs_buffer_uptodate(next, generation)) {
+ btrfs_tree_unlock(next);
+ free_extent_buffer(next);
+ next = NULL;
+ }
- atomic_inc(&root->fs_info->throttle_gen);
- wake_up(&root->fs_info->transaction_throttle);
- cond_resched();
+ if (!next) {
+ if (reada && level == 1)
+ reada_walk_down(trans, root, wc, path);
+ next = read_tree_block(root, bytenr, blocksize, generation);
+ btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
}
-out:
- kfree(sorted);
- /*
- * update the path to show we've processed the entire level 1
- * node. This will get saved into the root's drop_snapshot_progress
- * field so these drops are not repeated again if this transaction
- * commits.
- */
- path->slots[1] = nritems;
+ level--;
+ BUG_ON(level != btrfs_header_level(next));
+ path->nodes[level] = next;
+ path->slots[level] = 0;
+ path->locks[level] = 1;
+ wc->level = level;
+ if (wc->level == 1)
+ wc->reada_slot = 0;
return 0;
+skip:
+ wc->refs[level - 1] = 0;
+ wc->flags[level - 1] = 0;
+
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+ parent = path->nodes[level]->start;
+ } else {
+ BUG_ON(root->root_key.objectid !=
+ btrfs_header_owner(path->nodes[level]));
+ parent = 0;
+ }
+
+ ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
+ root->root_key.objectid, level - 1, 0);
+ BUG_ON(ret);
+
+ btrfs_tree_unlock(next);
+ free_extent_buffer(next);
+ return 1;
}
/*
- * helper function for drop_snapshot, this walks down the tree dropping ref
- * counts as it goes.
+ * hepler to process tree block while walking up the tree.
+ *
+ * when wc->stage == DROP_REFERENCE, this function drops
+ * reference count on the block.
+ *
+ * when wc->stage == UPDATE_BACKREF, this function changes
+ * wc->stage back to DROP_REFERENCE if we changed wc->stage
+ * to UPDATE_BACKREF previously while processing the block.
+ *
+ * NOTE: return value 1 means we should stop walking up.
*/
-static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path, int *level)
+static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct walk_control *wc)
{
- u64 root_owner;
- u64 root_gen;
- u64 bytenr;
- u64 ptr_gen;
- struct extent_buffer *next;
- struct extent_buffer *cur;
- struct extent_buffer *parent;
- u32 blocksize;
- int ret;
- u32 refs;
-
- WARN_ON(*level < 0);
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
- ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
- path->nodes[*level]->len, &refs);
- BUG_ON(ret);
- if (refs > 1)
- goto out;
-
- /*
- * walk down to the last node level and free all the leaves
- */
- while (*level >= 0) {
- WARN_ON(*level < 0);
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
- cur = path->nodes[*level];
+ int ret = 0;
+ int level = wc->level;
+ struct extent_buffer *eb = path->nodes[level];
+ u64 parent = 0;
- if (btrfs_header_level(cur) != *level)
- WARN_ON(1);
+ if (wc->stage == UPDATE_BACKREF) {
+ BUG_ON(wc->shared_level < level);
+ if (level < wc->shared_level)
+ goto out;
- if (path->slots[*level] >=
- btrfs_header_nritems(cur))
- break;
+ ret = find_next_key(path, level + 1, &wc->update_progress);
+ if (ret > 0)
+ wc->update_ref = 0;
- /* the new code goes down to level 1 and does all the
- * leaves pointed to that node in bulk. So, this check
- * for level 0 will always be false.
- *
- * But, the disk format allows the drop_snapshot_progress
- * field in the root to leave things in a state where
- * a leaf will need cleaning up here. If someone crashes
- * with the old code and then boots with the new code,
- * we might find a leaf here.
- */
- if (*level == 0) {
- ret = btrfs_drop_leaf_ref(trans, root, cur);
- BUG_ON(ret);
- break;
- }
+ wc->stage = DROP_REFERENCE;
+ wc->shared_level = -1;
+ path->slots[level] = 0;
/*
- * once we get to level one, process the whole node
- * at once, including everything below it.
+ * check reference count again if the block isn't locked.
+ * we should start walking down the tree again if reference
+ * count is one.
*/
- if (*level == 1) {
- ret = drop_level_one_refs(trans, root, path);
+ if (!path->locks[level]) {
+ BUG_ON(level == 0);
+ btrfs_tree_lock(eb);
+ btrfs_set_lock_blocking(eb);
+ path->locks[level] = 1;
+
+ ret = btrfs_lookup_extent_info(trans, root,
+ eb->start, eb->len,
+ &wc->refs[level],
+ &wc->flags[level]);
BUG_ON(ret);
- break;
+ BUG_ON(wc->refs[level] == 0);
+ if (wc->refs[level] == 1) {
+ btrfs_tree_unlock(eb);
+ path->locks[level] = 0;
+ return 1;
+ }
}
+ }
- bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
- ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
- blocksize = btrfs_level_size(root, *level - 1);
-
- ret = drop_snap_lookup_refcount(trans, root, bytenr,
- blocksize, &refs);
- BUG_ON(ret);
+ /* wc->stage == DROP_REFERENCE */
+ BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
- /*
- * if there is more than one reference, we don't need
- * to read that node to drop any references it has. We
- * just drop the ref we hold on that node and move on to the
- * next slot in this level.
- */
- if (refs != 1) {
- parent = path->nodes[*level];
- root_owner = btrfs_header_owner(parent);
- root_gen = btrfs_header_generation(parent);
- path->slots[*level]++;
-
- ret = btrfs_free_extent(trans, root, bytenr,
- blocksize, parent->start,
- root_owner, root_gen,
- *level - 1, 1);
+ if (wc->refs[level] == 1) {
+ if (level == 0) {
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ ret = btrfs_dec_ref(trans, root, eb, 1);
+ else
+ ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret);
-
- atomic_inc(&root->fs_info->throttle_gen);
- wake_up(&root->fs_info->transaction_throttle);
- cond_resched();
-
- continue;
}
-
- /*
- * we need to keep freeing things in the next level down.
- * read the block and loop around to process it
- */
- next = read_tree_block(root, bytenr, blocksize, ptr_gen);
- WARN_ON(*level <= 0);
- if (path->nodes[*level-1])
- free_extent_buffer(path->nodes[*level-1]);
- path->nodes[*level-1] = next;
- *level = btrfs_header_level(next);
- path->slots[*level] = 0;
- cond_resched();
+ /* make block locked assertion in clean_tree_block happy */
+ if (!path->locks[level] &&
+ btrfs_header_generation(eb) == trans->transid) {
+ btrfs_tree_lock(eb);
+ btrfs_set_lock_blocking(eb);
+ path->locks[level] = 1;
+ }
+ clean_tree_block(trans, root, eb);
}
-out:
- WARN_ON(*level < 0);
- WARN_ON(*level >= BTRFS_MAX_LEVEL);
- if (path->nodes[*level] == root->node) {
- parent = path->nodes[*level];
- bytenr = path->nodes[*level]->start;
+ if (eb == root->node) {
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ parent = eb->start;
+ else
+ BUG_ON(root->root_key.objectid !=
+ btrfs_header_owner(eb));
} else {
- parent = path->nodes[*level + 1];
- bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
+ if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ parent = path->nodes[level + 1]->start;
+ else
+ BUG_ON(root->root_key.objectid !=
+ btrfs_header_owner(path->nodes[level + 1]));
}
- blocksize = btrfs_level_size(root, *level);
- root_owner = btrfs_header_owner(parent);
- root_gen = btrfs_header_generation(parent);
-
- /*
- * cleanup and free the reference on the last node
- * we processed
- */
- ret = btrfs_free_extent(trans, root, bytenr, blocksize,
- parent->start, root_owner, root_gen,
- *level, 1);
- free_extent_buffer(path->nodes[*level]);
- path->nodes[*level] = NULL;
-
- *level += 1;
+ ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
+ root->root_key.objectid, level, 0);
BUG_ON(ret);
-
- cond_resched();
- return 0;
+out:
+ wc->refs[level] = 0;
+ wc->flags[level] = 0;
+ return ret;
}
-#endif
-/*
- * helper function for drop_subtree, this function is similar to
- * walk_down_tree. The main difference is that it checks reference
- * counts while tree blocks are locked.
- */
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct btrfs_path *path, int *level)
+ struct btrfs_path *path,
+ struct walk_control *wc)
{
- struct extent_buffer *next;
- struct extent_buffer *cur;
- struct extent_buffer *parent;
- u64 bytenr;
- u64 ptr_gen;
- u64 refs;
- u64 flags;
- u32 blocksize;
+ int level = wc->level;
int ret;
- cur = path->nodes[*level];
- ret = btrfs_lookup_extent_info(trans, root, cur->start, cur->len,
- &refs, &flags);
- BUG_ON(ret);
- if (refs > 1)
- goto out;
-
- BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
-
- while (*level >= 0) {
- cur = path->nodes[*level];
- if (*level == 0) {
- ret = btrfs_drop_leaf_ref(trans, root, cur);
- BUG_ON(ret);
- clean_tree_block(trans, root, cur);
- break;
- }
- if (path->slots[*level] >= btrfs_header_nritems(cur)) {
- clean_tree_block(trans, root, cur);
+ while (level >= 0) {
+ if (path->slots[level] >=
+ btrfs_header_nritems(path->nodes[level]))
break;
- }
- bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
- blocksize = btrfs_level_size(root, *level - 1);
- ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
+ ret = walk_down_proc(trans, root, path, wc);
+ if (ret > 0)
+ break;
- next = read_tree_block(root, bytenr, blocksize, ptr_gen);
- btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
+ if (level == 0)
+ break;
- ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
- &refs, &flags);
- BUG_ON(ret);
- if (refs > 1) {
- parent = path->nodes[*level];
- ret = btrfs_free_extent(trans, root, bytenr,
- blocksize, parent->start,
- btrfs_header_owner(parent),
- *level - 1, 0);
- BUG_ON(ret);
- path->slots[*level]++;
- btrfs_tree_unlock(next);
- free_extent_buffer(next);
+ ret = do_walk_down(trans, root, path, wc);
+ if (ret > 0) {
+ path->slots[level]++;
continue;
}
-
- BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
-
- *level = btrfs_header_level(next);
- path->nodes[*level] = next;
- path->slots[*level] = 0;
- path->locks[*level] = 1;
- cond_resched();
- }
-out:
- if (path->nodes[*level] == root->node)
- parent = path->nodes[*level];
- else
- parent = path->nodes[*level + 1];
- bytenr = path->nodes[*level]->start;
- blocksize = path->nodes[*level]->len;
-
- ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start,
- btrfs_header_owner(parent), *level, 0);
- BUG_ON(ret);
-
- if (path->locks[*level]) {
- btrfs_tree_unlock(path->nodes[*level]);
- path->locks[*level] = 0;
+ level = wc->level;
}
- free_extent_buffer(path->nodes[*level]);
- path->nodes[*level] = NULL;
- *level += 1;
- cond_resched();
return 0;
}
-/*
- * helper for dropping snapshots. This walks back up the tree in the path
- * to find the first node higher up where we haven't yet gone through
- * all the slots
- */
static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- int *level, int max_level)
+ struct walk_control *wc, int max_level)
{
- struct btrfs_root_item *root_item = &root->root_item;
- int i;
- int slot;
+ int level = wc->level;
int ret;
- for (i = *level; i < max_level && path->nodes[i]; i++) {
- slot = path->slots[i];
- if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
- /*
- * there is more work to do in this level.
- * Update the drop_progress marker to reflect
- * the work we've done so far, and then bump
- * the slot number
- */
- path->slots[i]++;
- WARN_ON(*level == 0);
- if (max_level == BTRFS_MAX_LEVEL) {
- btrfs_node_key(path->nodes[i],
- &root_item->drop_progress,
- path->slots[i]);
- root_item->drop_level = i;
- }
- *level = i;
+ path->slots[level] = btrfs_header_nritems(path->nodes[level]);
+ while (level < max_level && path->nodes[level]) {
+ wc->level = level;
+ if (path->slots[level] + 1 <
+ btrfs_header_nritems(path->nodes[level])) {
+ path->slots[level]++;
return 0;
} else {
- struct extent_buffer *parent;
-
- /*
- * this whole node is done, free our reference
- * on it and go up one level
- */
- if (path->nodes[*level] == root->node)
- parent = path->nodes[*level];
- else
- parent = path->nodes[*level + 1];
+ ret = walk_up_proc(trans, root, path, wc);
+ if (ret > 0)
+ return 0;
- clean_tree_block(trans, root, path->nodes[i]);
- ret = btrfs_free_extent(trans, root,
- path->nodes[i]->start,
- path->nodes[i]->len,
- parent->start,
- btrfs_header_owner(parent),
- *level, 0);
- BUG_ON(ret);
- if (path->locks[*level]) {
- btrfs_tree_unlock(path->nodes[i]);
- path->locks[i] = 0;
+ if (path->locks[level]) {
+ btrfs_tree_unlock(path->nodes[level]);
+ path->locks[level] = 0;
}
- free_extent_buffer(path->nodes[i]);
- path->nodes[i] = NULL;
- *level = i + 1;
+ free_extent_buffer(path->nodes[level]);
+ path->nodes[level] = NULL;
+ level++;
}
}
return 1;
}
/*
- * drop the reference count on the tree rooted at 'snap'. This traverses
- * the tree freeing any blocks that have a ref count of zero after being
- * decremented.
+ * drop a subvolume tree.
+ *
+ * this function traverses the tree freeing any blocks that only
+ * referenced by the tree.
+ *
+ * when a shared tree block is found. this function decreases its
+ * reference count by one. if update_ref is true, this function
+ * also make sure backrefs for the shared block and all lower level
+ * blocks are properly updated.
*/
-int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
- *root)
+int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
{
- int ret = 0;
- int wret;
- int level;
struct btrfs_path *path;
- int update_count;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *tree_root = root->fs_info->tree_root;
struct btrfs_root_item *root_item = &root->root_item;
+ struct walk_control *wc;
+ struct btrfs_key key;
+ int err = 0;
+ int ret;
+ int level;
path = btrfs_alloc_path();
BUG_ON(!path);
- level = btrfs_header_level(root->node);
+ wc = kzalloc(sizeof(*wc), GFP_NOFS);
+ BUG_ON(!wc);
+
+ trans = btrfs_start_transaction(tree_root, 1);
+
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
+ level = btrfs_header_level(root->node);
path->nodes[level] = btrfs_lock_root_node(root);
btrfs_set_lock_blocking(path->nodes[level]);
path->slots[level] = 0;
path->locks[level] = 1;
+ memset(&wc->update_progress, 0,
+ sizeof(wc->update_progress));
} else {
- struct btrfs_key key;
- struct btrfs_disk_key found_key;
- struct extent_buffer *node;
-
btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+ memcpy(&wc->update_progress, &key,
+ sizeof(wc->update_progress));
+
level = root_item->drop_level;
+ BUG_ON(level == 0);
path->lowest_level = level;
- wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (wret < 0) {
- ret = wret;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ path->lowest_level = 0;
+ if (ret < 0) {
+ err = ret;
goto out;
}
- node = path->nodes[level];
- btrfs_node_key(node, &found_key, path->slots[level]);
- WARN_ON(memcmp(&found_key, &root_item->drop_progress,
- sizeof(found_key)));
+ WARN_ON(ret > 0);
+
/*
* unlock our path, this is safe because only this
* function is allowed to delete this snapshot
*/
btrfs_unlock_up_safe(path, 0);
+
+ level = btrfs_header_level(root->node);
+ while (1) {
+ btrfs_tree_lock(path->nodes[level]);
+ btrfs_set_lock_blocking(path->nodes[level]);
+
+ ret = btrfs_lookup_extent_info(trans, root,
+ path->nodes[level]->start,
+ path->nodes[level]->len,
+ &wc->refs[level],
+ &wc->flags[level]);
+ BUG_ON(ret);
+ BUG_ON(wc->refs[level] == 0);
+
+ if (level == root_item->drop_level)
+ break;
+
+ btrfs_tree_unlock(path->nodes[level]);
+ WARN_ON(wc->refs[level] != 1);
+ level--;
+ }
}
+
+ wc->level = level;
+ wc->shared_level = -1;
+ wc->stage = DROP_REFERENCE;
+ wc->update_ref = update_ref;
+ wc->keep_locks = 0;
+ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
+
while (1) {
- unsigned long update;
- wret = walk_down_tree(trans, root, path, &level);
- if (wret > 0)
+ ret = walk_down_tree(trans, root, path, wc);
+ if (ret < 0) {
+ err = ret;
break;
- if (wret < 0)
- ret = wret;
+ }
- wret = walk_up_tree(trans, root, path, &level,
- BTRFS_MAX_LEVEL);
- if (wret > 0)
+ ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
+ if (ret < 0) {
+ err = ret;
break;
- if (wret < 0)
- ret = wret;
- if (trans->transaction->in_commit ||
- trans->transaction->delayed_refs.flushing) {
- ret = -EAGAIN;
+ }
+
+ if (ret > 0) {
+ BUG_ON(wc->stage != DROP_REFERENCE);
break;
}
- for (update_count = 0; update_count < 16; update_count++) {
+
+ if (wc->stage == DROP_REFERENCE) {
+ level = wc->level;
+ btrfs_node_key(path->nodes[level],
+ &root_item->drop_progress,
+ path->slots[level]);
+ root_item->drop_level = level;
+ }
+
+ BUG_ON(wc->level == 0);
+ if (trans->transaction->in_commit ||
+ trans->transaction->delayed_refs.flushing) {
+ ret = btrfs_update_root(trans, tree_root,
+ &root->root_key,
+ root_item);
+ BUG_ON(ret);
+
+ btrfs_end_transaction(trans, tree_root);
+ trans = btrfs_start_transaction(tree_root, 1);
+ } else {
+ unsigned long update;
update = trans->delayed_ref_updates;
trans->delayed_ref_updates = 0;
if (update)
- btrfs_run_delayed_refs(trans, root, update);
- else
- break;
+ btrfs_run_delayed_refs(trans, tree_root,
+ update);
+ }
+ }
+ btrfs_release_path(root, path);
+ BUG_ON(err);
+
+ ret = btrfs_del_root(trans, tree_root, &root->root_key);
+ BUG_ON(ret);
+
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+ ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
+ NULL, NULL);
+ BUG_ON(ret < 0);
+ if (ret > 0) {
+ ret = btrfs_del_orphan_item(trans, tree_root,
+ root->root_key.objectid);
+ BUG_ON(ret);
}
}
+
+ if (root->in_radix) {
+ btrfs_free_fs_root(tree_root->fs_info, root);
+ } else {
+ free_extent_buffer(root->node);
+ free_extent_buffer(root->commit_root);
+ kfree(root);
+ }
out:
+ btrfs_end_transaction(trans, tree_root);
+ kfree(wc);
btrfs_free_path(path);
- return ret;
+ return err;
}
+/*
+ * drop subtree rooted at tree block 'node'.
+ *
+ * NOTE: this function will unlock and release tree block 'node'
+ */
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
struct extent_buffer *parent)
{
struct btrfs_path *path;
+ struct walk_control *wc;
int level;
int parent_level;
int ret = 0;
int wret;
+ BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+
path = btrfs_alloc_path();
BUG_ON(!path);
+ wc = kzalloc(sizeof(*wc), GFP_NOFS);
+ BUG_ON(!wc);
+
btrfs_assert_tree_locked(parent);
parent_level = btrfs_header_level(parent);
extent_buffer_get(parent);
@@ -4817,24 +5419,34 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
btrfs_assert_tree_locked(node);
level = btrfs_header_level(node);
- extent_buffer_get(node);
path->nodes[level] = node;
path->slots[level] = 0;
+ path->locks[level] = 1;
+
+ wc->refs[parent_level] = 1;
+ wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+ wc->level = level;
+ wc->shared_level = -1;
+ wc->stage = DROP_REFERENCE;
+ wc->update_ref = 0;
+ wc->keep_locks = 1;
+ wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
while (1) {
- wret = walk_down_tree(trans, root, path, &level);
- if (wret < 0)
+ wret = walk_down_tree(trans, root, path, wc);
+ if (wret < 0) {
ret = wret;
- if (wret != 0)
break;
+ }
- wret = walk_up_tree(trans, root, path, &level, parent_level);
+ wret = walk_up_tree(trans, root, path, wc, parent_level);
if (wret < 0)
ret = wret;
if (wret != 0)
break;
}
+ kfree(wc);
btrfs_free_path(path);
return ret;
}
@@ -4961,9 +5573,9 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
while (1) {
int ret;
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
@@ -6406,287 +7018,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
return 0;
}
-#if 0
-static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 objectid, u64 size)
-{
- struct btrfs_path *path;
- struct btrfs_inode_item *item;
- struct extent_buffer *leaf;
- int ret;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- path->leave_spinning = 1;
- ret = btrfs_insert_empty_inode(trans, root, path, objectid);
- if (ret)
- goto out;
-
- leaf = path->nodes[0];
- item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
- memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
- btrfs_set_inode_generation(leaf, item, 1);
- btrfs_set_inode_size(leaf, item, size);
- btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
- btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
- btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(root, path);
-out:
- btrfs_free_path(path);
- return ret;
-}
-
-static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group_cache *group)
+/*
+ * checks to see if its even possible to relocate this block group.
+ *
+ * @return - -1 if it's not a good idea to relocate this block group, 0 if its
+ * ok to go ahead and try.
+ */
+int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
{
- struct inode *inode = NULL;
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root;
- struct btrfs_key root_key;
- u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
- int err = 0;
+ struct btrfs_block_group_cache *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+ struct btrfs_device *device;
+ int full = 0;
+ int ret = 0;
- root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
- root_key.type = BTRFS_ROOT_ITEM_KEY;
- root_key.offset = (u64)-1;
- root = btrfs_read_fs_root_no_name(fs_info, &root_key);
- if (IS_ERR(root))
- return ERR_CAST(root);
+ block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
- trans = btrfs_start_transaction(root, 1);
- BUG_ON(!trans);
+ /* odd, couldn't find the block group, leave it alone */
+ if (!block_group)
+ return -1;
- err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
- if (err)
+ /* no bytes used, we're good */
+ if (!btrfs_block_group_used(&block_group->item))
goto out;
- err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
- BUG_ON(err);
-
- err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
- group->key.offset, 0, group->key.offset,
- 0, 0, 0);
- BUG_ON(err);
-
- inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
- if (inode->i_state & I_NEW) {
- BTRFS_I(inode)->root = root;
- BTRFS_I(inode)->location.objectid = objectid;
- BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
- BTRFS_I(inode)->location.offset = 0;
- btrfs_read_locked_inode(inode);
- unlock_new_inode(inode);
- BUG_ON(is_bad_inode(inode));
- } else {
- BUG_ON(1);
- }
- BTRFS_I(inode)->index_cnt = group->key.objectid;
-
- err = btrfs_orphan_add(trans, inode);
-out:
- btrfs_end_transaction(trans, root);
- if (err) {
- if (inode)
- iput(inode);
- inode = ERR_PTR(err);
- }
- return inode;
-}
-
-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
-{
-
- struct btrfs_ordered_sum *sums;
- struct btrfs_sector_sum *sector_sum;
- struct btrfs_ordered_extent *ordered;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct list_head list;
- size_t offset;
- int ret;
- u64 disk_bytenr;
-
- INIT_LIST_HEAD(&list);
-
- ordered = btrfs_lookup_ordered_extent(inode, file_pos);
- BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
-
- disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
- ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
- disk_bytenr + len - 1, &list);
-
- while (!list_empty(&list)) {
- sums = list_entry(list.next, struct btrfs_ordered_sum, list);
- list_del_init(&sums->list);
-
- sector_sum = sums->sums;
- sums->bytenr = ordered->start;
+ space_info = block_group->space_info;
+ spin_lock(&space_info->lock);
- offset = 0;
- while (offset < sums->len) {
- sector_sum->bytenr += ordered->start - disk_bytenr;
- sector_sum++;
- offset += root->sectorsize;
- }
+ full = space_info->full;
- btrfs_add_ordered_sum(inode, ordered, sums);
+ /*
+ * if this is the last block group we have in this space, we can't
+ * relocate it unless we're able to allocate a new chunk below.
+ *
+ * Otherwise, we need to make sure we have room in the space to handle
+ * all of the extents from this block group. If we can, we're good
+ */
+ if ((space_info->total_bytes != block_group->key.offset) &&
+ (space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ btrfs_block_group_used(&block_group->item) <
+ space_info->total_bytes)) {
+ spin_unlock(&space_info->lock);
+ goto out;
}
- btrfs_put_ordered_extent(ordered);
- return 0;
-}
-
-int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
-{
- struct btrfs_trans_handle *trans;
- struct btrfs_path *path;
- struct btrfs_fs_info *info = root->fs_info;
- struct extent_buffer *leaf;
- struct inode *reloc_inode;
- struct btrfs_block_group_cache *block_group;
- struct btrfs_key key;
- u64 skipped;
- u64 cur_byte;
- u64 total_found;
- u32 nritems;
- int ret;
- int progress;
- int pass = 0;
-
- root = root->fs_info->extent_root;
-
- block_group = btrfs_lookup_block_group(info, group_start);
- BUG_ON(!block_group);
-
- printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
- (unsigned long long)block_group->key.objectid,
- (unsigned long long)block_group->flags);
-
- path = btrfs_alloc_path();
- BUG_ON(!path);
-
- reloc_inode = create_reloc_inode(info, block_group);
- BUG_ON(IS_ERR(reloc_inode));
-
- __alloc_chunk_for_shrink(root, block_group, 1);
- set_block_group_readonly(block_group);
-
- btrfs_start_delalloc_inodes(info->tree_root);
- btrfs_wait_ordered_extents(info->tree_root, 0);
-again:
- skipped = 0;
- total_found = 0;
- progress = 0;
- key.objectid = block_group->key.objectid;
- key.offset = 0;
- key.type = 0;
- cur_byte = key.objectid;
-
- trans = btrfs_start_transaction(info->tree_root, 1);
- btrfs_commit_transaction(trans, info->tree_root);
+ spin_unlock(&space_info->lock);
- mutex_lock(&root->fs_info->cleaner_mutex);
- btrfs_clean_old_snapshots(info->tree_root);
- btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
- mutex_unlock(&root->fs_info->cleaner_mutex);
+ /*
+ * ok we don't have enough space, but maybe we have free space on our
+ * devices to allocate new chunks for relocation, so loop through our
+ * alloc devices and guess if we have enough space. However, if we
+ * were marked as full, then we know there aren't enough chunks, and we
+ * can just return.
+ */
+ ret = -1;
+ if (full)
+ goto out;
- trans = btrfs_start_transaction(info->tree_root, 1);
- btrfs_commit_transaction(trans, info->tree_root);
+ mutex_lock(&root->fs_info->chunk_mutex);
+ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+ u64 min_free = btrfs_block_group_used(&block_group->item);
+ u64 dev_offset, max_avail;
- while (1) {
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
-next:
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
- if (path->slots[0] >= nritems) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0)
- goto out;
- if (ret == 1) {
- ret = 0;
+ /*
+ * check to make sure we can actually find a chunk with enough
+ * space to fit our block group in.
+ */
+ if (device->total_bytes > device->bytes_used + min_free) {
+ ret = find_free_dev_extent(NULL, device, min_free,
+ &dev_offset, &max_avail);
+ if (!ret)
break;
- }
- leaf = path->nodes[0];
- nritems = btrfs_header_nritems(leaf);
+ ret = -1;
}
-
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-
- if (key.objectid >= block_group->key.objectid +
- block_group->key.offset)
- break;
-
- if (progress && need_resched()) {
- btrfs_release_path(root, path);
- cond_resched();
- progress = 0;
- continue;
- }
- progress = 1;
-
- if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
- key.objectid + key.offset <= cur_byte) {
- path->slots[0]++;
- goto next;
- }
-
- total_found++;
- cur_byte = key.objectid + key.offset;
- btrfs_release_path(root, path);
-
- __alloc_chunk_for_shrink(root, block_group, 0);
- ret = relocate_one_extent(root, path, &key, block_group,
- reloc_inode, pass);
- BUG_ON(ret < 0);
- if (ret > 0)
- skipped++;
-
- key.objectid = cur_byte;
- key.type = 0;
- key.offset = 0;
}
-
- btrfs_release_path(root, path);
-
- if (pass == 0) {
- btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
- invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
- }
-
- if (total_found > 0) {
- printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
- (unsigned long long)total_found, pass);
- pass++;
- if (total_found == skipped && pass > 2) {
- iput(reloc_inode);
- reloc_inode = create_reloc_inode(info, block_group);
- pass = 0;
- }
- goto again;
- }
-
- /* delete reloc_inode */
- iput(reloc_inode);
-
- /* unpin extents in this range */
- trans = btrfs_start_transaction(info->tree_root, 1);
- btrfs_commit_transaction(trans, info->tree_root);
-
- spin_lock(&block_group->lock);
- WARN_ON(block_group->pinned > 0);
- WARN_ON(block_group->reserved > 0);
- WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
- spin_unlock(&block_group->lock);
- btrfs_put_block_group(block_group);
- ret = 0;
+ mutex_unlock(&root->fs_info->chunk_mutex);
out:
- btrfs_free_path(path);
+ btrfs_put_block_group(block_group);
return ret;
}
-#endif
static int find_first_block_group(struct btrfs_root *root,
struct btrfs_path *path, struct btrfs_key *key)
@@ -6729,8 +7140,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
struct btrfs_block_group_cache *block_group;
struct btrfs_space_info *space_info;
+ struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
+ down_write(&info->extent_commit_sem);
+ while (!list_empty(&info->caching_block_groups)) {
+ caching_ctl = list_entry(info->caching_block_groups.next,
+ struct btrfs_caching_control, list);
+ list_del(&caching_ctl->list);
+ put_caching_control(caching_ctl);
+ }
+ up_write(&info->extent_commit_sem);
+
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -6739,11 +7160,15 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
&info->block_group_cache_tree);
spin_unlock(&info->block_group_cache_lock);
- btrfs_remove_free_space_cache(block_group);
down_write(&block_group->space_info->groups_sem);
list_del(&block_group->list);
up_write(&block_group->space_info->groups_sem);
+ if (block_group->cached == BTRFS_CACHE_STARTED)
+ wait_block_group_cache_done(block_group);
+
+ btrfs_remove_free_space_cache(block_group);
+
WARN_ON(atomic_read(&block_group->count) != 1);
kfree(block_group);
@@ -6809,9 +7234,18 @@ int btrfs_read_block_groups(struct btrfs_root *root)
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
spin_lock_init(&cache->tree_lock);
- mutex_init(&cache->cache_mutex);
+ cache->fs_info = info;
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
+
+ /*
+ * we only want to have 32k of ram per block group for keeping
+ * track of free space, and if we pass 1/2 of that we want to
+ * start converting things over to using bitmaps
+ */
+ cache->extents_thresh = ((1024 * 32) / 2) /
+ sizeof(struct btrfs_free_space);
+
read_extent_buffer(leaf, &cache->item,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(cache->item));
@@ -6820,12 +7254,40 @@ int btrfs_read_block_groups(struct btrfs_root *root)
key.objectid = found_key.objectid + found_key.offset;
btrfs_release_path(root, path);
cache->flags = btrfs_block_group_flags(&cache->item);
+ cache->sectorsize = root->sectorsize;
+
+ /*
+ * check for two cases, either we are full, and therefore
+ * don't need to bother with the caching work since we won't
+ * find any space, or we are empty, and we can just add all
+ * the space in and be done with it. This saves us _alot_ of
+ * time, particularly in the full case.
+ */
+ if (found_key.offset == btrfs_block_group_used(&cache->item)) {
+ exclude_super_stripes(root, cache);
+ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ free_excluded_extents(root, cache);
+ } else if (btrfs_block_group_used(&cache->item) == 0) {
+ exclude_super_stripes(root, cache);
+ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ add_new_free_space(cache, root->fs_info,
+ found_key.objectid,
+ found_key.objectid +
+ found_key.offset);
+ free_excluded_extents(root, cache);
+ }
ret = update_space_info(info, cache->flags, found_key.offset,
btrfs_block_group_used(&cache->item),
&space_info);
BUG_ON(ret);
cache->space_info = space_info;
+ spin_lock(&cache->space_info->lock);
+ cache->space_info->bytes_super += cache->bytes_super;
+ spin_unlock(&cache->space_info->lock);
+
down_write(&space_info->groups_sem);
list_add_tail(&cache->list, &space_info->block_groups);
up_write(&space_info->groups_sem);
@@ -6863,10 +7325,18 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->key.objectid = chunk_offset;
cache->key.offset = size;
cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ cache->sectorsize = root->sectorsize;
+
+ /*
+ * we only want to have 32k of ram per block group for keeping track
+ * of free space, and if we pass 1/2 of that we want to start
+ * converting things over to using bitmaps
+ */
+ cache->extents_thresh = ((1024 * 32) / 2) /
+ sizeof(struct btrfs_free_space);
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
spin_lock_init(&cache->tree_lock);
- mutex_init(&cache->cache_mutex);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
@@ -6875,9 +7345,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->flags = type;
btrfs_set_block_group_flags(&cache->item, type);
+ cache->last_byte_to_unpin = (u64)-1;
+ cache->cached = BTRFS_CACHE_FINISHED;
+ exclude_super_stripes(root, cache);
+
+ add_new_free_space(cache, root->fs_info, chunk_offset,
+ chunk_offset + size);
+
+ free_excluded_extents(root, cache);
+
ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
&cache->space_info);
BUG_ON(ret);
+
+ spin_lock(&cache->space_info->lock);
+ cache->space_info->bytes_super += cache->bytes_super;
+ spin_unlock(&cache->space_info->lock);
+
down_write(&cache->space_info->groups_sem);
list_add_tail(&cache->list, &cache->space_info->block_groups);
up_write(&cache->space_info->groups_sem);
@@ -6933,7 +7417,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
rb_erase(&block_group->cache_node,
&root->fs_info->block_group_cache_tree);
spin_unlock(&root->fs_info->block_group_cache_lock);
- btrfs_remove_free_space_cache(block_group);
+
down_write(&block_group->space_info->groups_sem);
/*
* we must use list_del_init so people can check to see if they
@@ -6942,11 +7426,17 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
list_del_init(&block_group->list);
up_write(&block_group->space_info->groups_sem);
+ if (block_group->cached == BTRFS_CACHE_STARTED)
+ wait_block_group_cache_done(block_group);
+
+ btrfs_remove_free_space_cache(block_group);
+
spin_lock(&block_group->space_info->lock);
block_group->space_info->total_bytes -= block_group->key.offset;
block_group->space_info->bytes_readonly -= block_group->key.offset;
spin_unlock(&block_group->space_info->lock);
- block_group->space_info->full = 0;
+
+ btrfs_clear_space_info_full(root->fs_info);
btrfs_put_block_group(block_group);
btrfs_put_block_group(block_group);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 68260180f587..de1793ba004a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
return NULL;
}
+static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
+ struct extent_state *other)
+{
+ if (tree->ops && tree->ops->merge_extent_hook)
+ tree->ops->merge_extent_hook(tree->mapping->host, new,
+ other);
+}
+
/*
* utility function to look for merge candidates inside a given range.
* Any extents with matching state are merged together into a single
@@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree,
other = rb_entry(other_node, struct extent_state, rb_node);
if (other->end == state->start - 1 &&
other->state == state->state) {
+ merge_cb(tree, state, other);
state->start = other->start;
other->tree = NULL;
rb_erase(&other->rb_node, &tree->state);
@@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree,
other = rb_entry(other_node, struct extent_state, rb_node);
if (other->start == state->end + 1 &&
other->state == state->state) {
+ merge_cb(tree, state, other);
other->start = state->start;
state->tree = NULL;
rb_erase(&state->rb_node, &tree->state);
free_extent_state(state);
+ state = NULL;
}
}
+
return 0;
}
-static void set_state_cb(struct extent_io_tree *tree,
+static int set_state_cb(struct extent_io_tree *tree,
struct extent_state *state,
unsigned long bits)
{
if (tree->ops && tree->ops->set_bit_hook) {
- tree->ops->set_bit_hook(tree->mapping->host, state->start,
- state->end, state->state, bits);
+ return tree->ops->set_bit_hook(tree->mapping->host,
+ state->start, state->end,
+ state->state, bits);
}
+
+ return 0;
}
static void clear_state_cb(struct extent_io_tree *tree,
struct extent_state *state,
unsigned long bits)
{
- if (tree->ops && tree->ops->clear_bit_hook) {
- tree->ops->clear_bit_hook(tree->mapping->host, state->start,
- state->end, state->state, bits);
- }
+ if (tree->ops && tree->ops->clear_bit_hook)
+ tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}
/*
@@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree,
int bits)
{
struct rb_node *node;
+ int ret;
if (end < start) {
printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -365,12 +379,15 @@ static int insert_state(struct extent_io_tree *tree,
(unsigned long long)start);
WARN_ON(1);
}
+ state->start = start;
+ state->end = end;
+ ret = set_state_cb(tree, state, bits);
+ if (ret)
+ return ret;
+
if (bits & EXTENT_DIRTY)
tree->dirty_bytes += end - start + 1;
- set_state_cb(tree, state, bits);
state->state |= bits;
- state->start = start;
- state->end = end;
node = tree_insert(&tree->state, end, &state->rb_node);
if (node) {
struct extent_state *found;
@@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree,
return 0;
}
+static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
+ u64 split)
+{
+ if (tree->ops && tree->ops->split_extent_hook)
+ return tree->ops->split_extent_hook(tree->mapping->host,
+ orig, split);
+ return 0;
+}
+
/*
* split a given extent state struct in two, inserting the preallocated
* struct 'prealloc' as the newly created second half. 'split' indicates an
@@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
struct extent_state *prealloc, u64 split)
{
struct rb_node *node;
+
+ split_cb(tree, orig, split);
+
prealloc->start = orig->start;
prealloc->end = split - 1;
prealloc->state = orig->state;
@@ -471,10 +500,14 @@ static int clear_state_bit(struct extent_io_tree *tree,
* bits were already set, or zero if none of the bits were already set.
*/
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int wake, int delete, gfp_t mask)
+ int bits, int wake, int delete,
+ struct extent_state **cached_state,
+ gfp_t mask)
{
struct extent_state *state;
+ struct extent_state *cached;
struct extent_state *prealloc = NULL;
+ struct rb_node *next_node;
struct rb_node *node;
u64 last_end;
int err;
@@ -488,6 +521,17 @@ again:
}
spin_lock(&tree->lock);
+ if (cached_state) {
+ cached = *cached_state;
+ *cached_state = NULL;
+ cached_state = NULL;
+ if (cached && cached->tree && cached->start == start) {
+ atomic_dec(&cached->refs);
+ state = cached;
+ goto hit_next;
+ }
+ free_extent_state(cached);
+ }
/*
* this search will find the extents that end after
* our range starts
@@ -496,6 +540,7 @@ again:
if (!node)
goto out;
state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
if (state->start > end)
goto out;
WARN_ON(state->end < start);
@@ -526,13 +571,11 @@ again:
if (err)
goto out;
if (state->end <= end) {
- set |= clear_state_bit(tree, state, bits,
- wake, delete);
+ set |= clear_state_bit(tree, state, bits, wake,
+ delete);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
- } else {
- start = state->start;
}
goto search_again;
}
@@ -547,19 +590,30 @@ again:
prealloc = alloc_extent_state(GFP_ATOMIC);
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
-
if (wake)
wake_up(&state->wq);
- set |= clear_state_bit(tree, prealloc, bits,
- wake, delete);
+
+ set |= clear_state_bit(tree, prealloc, bits, wake, delete);
+
prealloc = NULL;
goto out;
}
+ if (state->end < end && prealloc && !need_resched())
+ next_node = rb_next(&state->rb_node);
+ else
+ next_node = NULL;
+
set |= clear_state_bit(tree, state, bits, wake, delete);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
+ if (start <= end && next_node) {
+ state = rb_entry(next_node, struct extent_state,
+ rb_node);
+ if (state->start == start)
+ goto hit_next;
+ }
goto search_again;
out:
@@ -641,40 +695,59 @@ out:
return 0;
}
-static void set_state_bits(struct extent_io_tree *tree,
+static int set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
int bits)
{
+ int ret;
+
+ ret = set_state_cb(tree, state, bits);
+ if (ret)
+ return ret;
+
if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
}
- set_state_cb(tree, state, bits);
state->state |= bits;
+
+ return 0;
+}
+
+static void cache_state(struct extent_state *state,
+ struct extent_state **cached_ptr)
+{
+ if (cached_ptr && !(*cached_ptr)) {
+ if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
+ *cached_ptr = state;
+ atomic_inc(&state->refs);
+ }
+ }
}
/*
- * set some bits on a range in the tree. This may require allocations
- * or sleeping, so the gfp mask is used to indicate what is allowed.
+ * set some bits on a range in the tree. This may require allocations or
+ * sleeping, so the gfp mask is used to indicate what is allowed.
*
- * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
- * range already has the desired bits set. The start of the existing
- * range is returned in failed_start in this case.
+ * If any of the exclusive bits are set, this will fail with -EEXIST if some
+ * part of the range already has the desired bits set. The start of the
+ * existing range is returned in failed_start in this case.
*
- * [start, end] is inclusive
- * This takes the tree lock.
+ * [start, end] is inclusive This takes the tree lock.
*/
+
static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int exclusive, u64 *failed_start,
+ int bits, int exclusive_bits, u64 *failed_start,
+ struct extent_state **cached_state,
gfp_t mask)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
struct rb_node *node;
int err = 0;
- int set;
u64 last_start;
u64 last_end;
+
again:
if (!prealloc && (mask & __GFP_WAIT)) {
prealloc = alloc_extent_state(mask);
@@ -683,6 +756,13 @@ again:
}
spin_lock(&tree->lock);
+ if (cached_state && *cached_state) {
+ state = *cached_state;
+ if (state->start == start && state->tree) {
+ node = &state->rb_node;
+ goto hit_next;
+ }
+ }
/*
* this search will find all the extents that end after
* our range starts.
@@ -694,8 +774,8 @@ again:
BUG_ON(err == -EEXIST);
goto out;
}
-
state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
last_start = state->start;
last_end = state->end;
@@ -706,17 +786,32 @@ again:
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) {
- set = state->state & bits;
- if (set && exclusive) {
+ struct rb_node *next_node;
+ if (state->state & exclusive_bits) {
*failed_start = state->start;
err = -EEXIST;
goto out;
}
- set_state_bits(tree, state, bits);
+
+ err = set_state_bits(tree, state, bits);
+ if (err)
+ goto out;
+
+ cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
+
start = last_end + 1;
+ if (start < end && prealloc && !need_resched()) {
+ next_node = rb_next(node);
+ if (next_node) {
+ state = rb_entry(next_node, struct extent_state,
+ rb_node);
+ if (state->start == start)
+ goto hit_next;
+ }
+ }
goto search_again;
}
@@ -737,8 +832,7 @@ again:
* desired bit on it.
*/
if (state->start < start) {
- set = state->state & bits;
- if (exclusive && set) {
+ if (state->state & exclusive_bits) {
*failed_start = start;
err = -EEXIST;
goto out;
@@ -749,13 +843,14 @@ again:
if (err)
goto out;
if (state->end <= end) {
- set_state_bits(tree, state, bits);
+ err = set_state_bits(tree, state, bits);
+ if (err)
+ goto out;
+ cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
- } else {
- start = state->start;
}
goto search_again;
}
@@ -774,10 +869,13 @@ again:
this_end = last_start - 1;
err = insert_state(tree, prealloc, start, this_end,
bits);
- prealloc = NULL;
BUG_ON(err == -EEXIST);
- if (err)
+ if (err) {
+ prealloc = NULL;
goto out;
+ }
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
start = this_end + 1;
goto search_again;
}
@@ -788,8 +886,7 @@ again:
* on the first half
*/
if (state->start <= end && state->end > end) {
- set = state->state & bits;
- if (exclusive && set) {
+ if (state->state & exclusive_bits) {
*failed_start = start;
err = -EEXIST;
goto out;
@@ -797,7 +894,12 @@ again:
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
- set_state_bits(tree, prealloc, bits);
+ err = set_state_bits(tree, prealloc, bits);
+ if (err) {
+ prealloc = NULL;
+ goto out;
+ }
+ cache_state(prealloc, cached_state);
merge_state(tree, prealloc);
prealloc = NULL;
goto out;
@@ -826,86 +928,64 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
- mask);
-}
-
-int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
+ NULL, mask);
}
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask)
{
return set_extent_bit(tree, start, end, bits, 0, NULL,
- mask);
+ NULL, mask);
}
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask)
{
- return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
+ return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
}
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return set_extent_bit(tree, start, end,
- EXTENT_DELALLOC | EXTENT_DIRTY,
- 0, NULL, mask);
+ EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
+ 0, NULL, NULL, mask);
}
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return clear_extent_bit(tree, start, end,
- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
-}
-
-int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
+ NULL, mask);
}
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
- mask);
+ NULL, mask);
}
static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
- return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
+ return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
+ NULL, mask);
}
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
- mask);
+ NULL, mask);
}
static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, gfp_t mask)
{
- return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
-}
-
-static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
- gfp_t mask)
-{
- return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
- 0, NULL, mask);
-}
-
-static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
- u64 end, gfp_t mask)
-{
- return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
+ return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
+ NULL, mask);
}
int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -917,13 +997,15 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
* either insert or lock state struct between start and end use mask to tell
* us if waiting is desired.
*/
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, struct extent_state **cached_state, gfp_t mask)
{
int err;
u64 failed_start;
while (1) {
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
- &failed_start, mask);
+ err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+ EXTENT_LOCKED, &failed_start,
+ cached_state, mask);
if (err == -EEXIST && (mask & __GFP_WAIT)) {
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
start = failed_start;
@@ -935,27 +1017,40 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
return err;
}
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+{
+ return lock_extent_bits(tree, start, end, 0, NULL, mask);
+}
+
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
int err;
u64 failed_start;
- err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
- &failed_start, mask);
+ err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+ &failed_start, NULL, mask);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
- EXTENT_LOCKED, 1, 0, mask);
+ EXTENT_LOCKED, 1, 0, NULL, mask);
return 0;
}
return 1;
}
+int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached, gfp_t mask)
+{
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
+ mask);
+}
+
int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
- return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
+ return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
+ mask);
}
/*
@@ -974,7 +1069,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
page_cache_release(page);
index++;
}
- set_extent_dirty(tree, start, end, GFP_NOFS);
return 0;
}
@@ -994,7 +1088,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
page_cache_release(page);
index++;
}
- set_extent_writeback(tree, start, end, GFP_NOFS);
return 0;
}
@@ -1232,6 +1325,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode,
u64 delalloc_start;
u64 delalloc_end;
u64 found;
+ struct extent_state *cached_state = NULL;
int ret;
int loops = 0;
@@ -1269,6 +1363,7 @@ again:
/* some of the pages are gone, lets avoid looping by
* shortening the size of the delalloc range we're searching
*/
+ free_extent_state(cached_state);
if (!loops) {
unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
max_bytes = PAGE_CACHE_SIZE - offset;
@@ -1282,18 +1377,21 @@ again:
BUG_ON(ret);
/* step three, lock the state bits for the whole range */
- lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+ lock_extent_bits(tree, delalloc_start, delalloc_end,
+ 0, &cached_state, GFP_NOFS);
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
- EXTENT_DELALLOC, 1);
+ EXTENT_DELALLOC, 1, cached_state);
if (!ret) {
- unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+ unlock_extent_cached(tree, delalloc_start, delalloc_end,
+ &cached_state, GFP_NOFS);
__unlock_for_delalloc(inode, locked_page,
delalloc_start, delalloc_end);
cond_resched();
goto again;
}
+ free_extent_state(cached_state);
*start = delalloc_start;
*end = delalloc_end;
out_failed:
@@ -1307,7 +1405,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
int clear_unlock,
int clear_delalloc, int clear_dirty,
int set_writeback,
- int end_writeback)
+ int end_writeback,
+ int set_private2)
{
int ret;
struct page *pages[16];
@@ -1325,8 +1424,9 @@ int extent_clear_unlock_delalloc(struct inode *inode,
if (clear_delalloc)
clear_bits |= EXTENT_DELALLOC;
- clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
- if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
+ clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
+ if (!(unlock_pages || clear_dirty || set_writeback || end_writeback ||
+ set_private2))
return 0;
while (nr_pages > 0) {
@@ -1334,6 +1434,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
min_t(unsigned long,
nr_pages, ARRAY_SIZE(pages)), pages);
for (i = 0; i < ret; i++) {
+
+ if (set_private2)
+ SetPagePrivate2(pages[i]);
+
if (pages[i] == locked_page) {
page_cache_release(pages[i]);
continue;
@@ -1476,14 +1580,17 @@ out:
* range is found set.
*/
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int filled)
+ int bits, int filled, struct extent_state *cached)
{
struct extent_state *state = NULL;
struct rb_node *node;
int bitset = 0;
spin_lock(&tree->lock);
- node = tree_search(tree, start);
+ if (cached && cached->tree && cached->start == start)
+ node = &cached->rb_node;
+ else
+ node = tree_search(tree, start);
while (node && start <= end) {
state = rb_entry(node, struct extent_state, rb_node);
@@ -1503,6 +1610,10 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
bitset = 0;
break;
}
+
+ if (state->end == (u64)-1)
+ break;
+
start = state->end + 1;
if (start > end)
break;
@@ -1526,7 +1637,7 @@ static int check_page_uptodate(struct extent_io_tree *tree,
{
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 end = start + PAGE_CACHE_SIZE - 1;
- if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
+ if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
SetPageUptodate(page);
return 0;
}
@@ -1540,7 +1651,7 @@ static int check_page_locked(struct extent_io_tree *tree,
{
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 end = start + PAGE_CACHE_SIZE - 1;
- if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
+ if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
unlock_page(page);
return 0;
}
@@ -1552,10 +1663,7 @@ static int check_page_locked(struct extent_io_tree *tree,
static int check_page_writeback(struct extent_io_tree *tree,
struct page *page)
{
- u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
- u64 end = start + PAGE_CACHE_SIZE - 1;
- if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
- end_page_writeback(page);
+ end_page_writeback(page);
return 0;
}
@@ -1613,13 +1721,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
}
if (!uptodate) {
- clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
+ clear_extent_uptodate(tree, start, end, GFP_NOFS);
ClearPageUptodate(page);
SetPageError(page);
}
- clear_extent_writeback(tree, start, end, GFP_ATOMIC);
-
if (whole_page)
end_page_writeback(page);
else
@@ -1983,7 +2089,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
continue;
}
/* the get_extent function already copied into the page */
- if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
+ if (test_range_bit(tree, cur, cur_end,
+ EXTENT_UPTODATE, 1, NULL)) {
check_page_uptodate(tree, page);
unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
cur = cur + iosize;
@@ -2078,6 +2185,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
u64 iosize;
u64 unlock_start;
sector_t sector;
+ struct extent_state *cached_state = NULL;
struct extent_map *em;
struct block_device *bdev;
int ret;
@@ -2124,6 +2232,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
delalloc_end = 0;
page_started = 0;
if (!epd->extent_locked) {
+ u64 delalloc_to_write = 0;
/*
* make sure the wbc mapping index is at least updated
* to this page.
@@ -2143,8 +2252,24 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
tree->ops->fill_delalloc(inode, page, delalloc_start,
delalloc_end, &page_started,
&nr_written);
+ /*
+ * delalloc_end is already one less than the total
+ * length, so we don't subtract one from
+ * PAGE_CACHE_SIZE
+ */
+ delalloc_to_write += (delalloc_end - delalloc_start +
+ PAGE_CACHE_SIZE) >>
+ PAGE_CACHE_SHIFT;
delalloc_start = delalloc_end + 1;
}
+ if (wbc->nr_to_write < delalloc_to_write) {
+ int thresh = 8192;
+
+ if (delalloc_to_write < thresh * 2)
+ thresh = delalloc_to_write;
+ wbc->nr_to_write = min_t(u64, delalloc_to_write,
+ thresh);
+ }
/* did the fill delalloc function already unlock and start
* the IO?
@@ -2160,15 +2285,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
goto done_unlocked;
}
}
- lock_extent(tree, start, page_end, GFP_NOFS);
-
- unlock_start = start;
-
if (tree->ops && tree->ops->writepage_start_hook) {
ret = tree->ops->writepage_start_hook(page, start,
page_end);
if (ret == -EAGAIN) {
- unlock_extent(tree, start, page_end, GFP_NOFS);
redirty_page_for_writepage(wbc, page);
update_nr_written(page, wbc, nr_written);
unlock_page(page);
@@ -2184,12 +2304,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
update_nr_written(page, wbc, nr_written + 1);
end = page_end;
- if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
- printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
-
if (last_byte <= start) {
- clear_extent_dirty(tree, start, page_end, GFP_NOFS);
- unlock_extent(tree, start, page_end, GFP_NOFS);
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, start,
page_end, NULL, 1);
@@ -2197,13 +2312,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
goto done;
}
- set_extent_uptodate(tree, start, page_end, GFP_NOFS);
blocksize = inode->i_sb->s_blocksize;
while (cur <= end) {
if (cur >= last_byte) {
- clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
- unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, cur,
page_end, NULL, 1);
@@ -2235,12 +2347,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
*/
if (compressed || block_start == EXTENT_MAP_HOLE ||
block_start == EXTENT_MAP_INLINE) {
- clear_extent_dirty(tree, cur,
- cur + iosize - 1, GFP_NOFS);
-
- unlock_extent(tree, unlock_start, cur + iosize - 1,
- GFP_NOFS);
-
/*
* end_io notification does not happen here for
* compressed extents
@@ -2265,13 +2371,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
}
/* leave this out until we have a page_mkwrite call */
if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
- EXTENT_DIRTY, 0)) {
+ EXTENT_DIRTY, 0, NULL)) {
cur = cur + iosize;
pg_offset += iosize;
continue;
}
- clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
if (tree->ops && tree->ops->writepage_io_hook) {
ret = tree->ops->writepage_io_hook(page, cur,
cur + iosize - 1);
@@ -2309,12 +2414,12 @@ done:
set_page_writeback(page);
end_page_writeback(page);
}
- if (unlock_start <= page_end)
- unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
unlock_page(page);
done_unlocked:
+ /* drop our reference on any cached states */
+ free_extent_state(cached_state);
return 0;
}
@@ -2339,9 +2444,9 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
writepage_t writepage, void *data,
void (*flush_fn)(void *))
{
- struct backing_dev_info *bdi = mapping->backing_dev_info;
int ret = 0;
int done = 0;
+ int nr_to_write_done = 0;
struct pagevec pvec;
int nr_pages;
pgoff_t index;
@@ -2361,7 +2466,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
scanned = 1;
}
retry:
- while (!done && (index <= end) &&
+ while (!done && !nr_to_write_done && (index <= end) &&
(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
PAGECACHE_TAG_DIRTY, min(end - index,
(pgoff_t)PAGEVEC_SIZE-1) + 1))) {
@@ -2412,12 +2517,15 @@ retry:
unlock_page(page);
ret = 0;
}
- if (ret || wbc->nr_to_write <= 0)
- done = 1;
- if (wbc->nonblocking && bdi_write_congested(bdi)) {
- wbc->encountered_congestion = 1;
+ if (ret)
done = 1;
- }
+
+ /*
+ * the filesystem may choose to bump up nr_to_write.
+ * We have to make sure to honor the new nr_to_write
+ * at any time
+ */
+ nr_to_write_done = wbc->nr_to_write <= 0;
}
pagevec_release(&pvec);
cond_resched();
@@ -2604,10 +2712,10 @@ int extent_invalidatepage(struct extent_io_tree *tree,
return 0;
lock_extent(tree, start, end, GFP_NOFS);
- wait_on_extent_writeback(tree, start, end);
+ wait_on_page_writeback(page);
clear_extent_bit(tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
- 1, 1, GFP_NOFS);
+ 1, 1, NULL, GFP_NOFS);
return 0;
}
@@ -2687,7 +2795,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
!isnew && !PageUptodate(page) &&
(block_off_end > to || block_off_start < from) &&
!test_range_bit(tree, block_start, cur_end,
- EXTENT_UPTODATE, 1)) {
+ EXTENT_UPTODATE, 1, NULL)) {
u64 sector;
u64 extent_offset = block_start - em->start;
size_t iosize;
@@ -2701,7 +2809,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
*/
set_extent_bit(tree, block_start,
block_start + iosize - 1,
- EXTENT_LOCKED, 0, NULL, GFP_NOFS);
+ EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
ret = submit_extent_page(READ, tree, page,
sector, iosize, page_offset, em->bdev,
NULL, 1,
@@ -2742,13 +2850,18 @@ int try_release_extent_state(struct extent_map_tree *map,
int ret = 1;
if (test_range_bit(tree, start, end,
- EXTENT_IOBITS | EXTENT_ORDERED, 0))
+ EXTENT_IOBITS, 0, NULL))
ret = 0;
else {
if ((mask & GFP_NOFS) == GFP_NOFS)
mask = GFP_NOFS;
- clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
- 1, 1, mask);
+ /*
+ * at this point we can safely clear everything except the
+ * locked bit and the nodatasum bit
+ */
+ clear_extent_bit(tree, start, end,
+ ~(EXTENT_LOCKED | EXTENT_NODATASUM),
+ 0, 0, NULL, mask);
}
return ret;
}
@@ -2771,29 +2884,28 @@ int try_release_extent_mapping(struct extent_map_tree *map,
u64 len;
while (start <= end) {
len = end - start + 1;
- spin_lock(&map->lock);
+ write_lock(&map->lock);
em = lookup_extent_mapping(map, start, len);
if (!em || IS_ERR(em)) {
- spin_unlock(&map->lock);
+ write_unlock(&map->lock);
break;
}
if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
em->start != start) {
- spin_unlock(&map->lock);
+ write_unlock(&map->lock);
free_extent_map(em);
break;
}
if (!test_range_bit(tree, em->start,
extent_map_end(em) - 1,
- EXTENT_LOCKED | EXTENT_WRITEBACK |
- EXTENT_ORDERED,
- 0)) {
+ EXTENT_LOCKED | EXTENT_WRITEBACK,
+ 0, NULL)) {
remove_extent_mapping(map, em);
/* once for the rb tree */
free_extent_map(em);
}
start = extent_map_end(em);
- spin_unlock(&map->lock);
+ write_unlock(&map->lock);
/* once for us */
free_extent_map(em);
@@ -3203,7 +3315,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
int uptodate;
unsigned long index;
- ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
+ ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
if (ret)
return 1;
while (start <= end) {
@@ -3233,7 +3345,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
return 1;
ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
- EXTENT_UPTODATE, 1);
+ EXTENT_UPTODATE, 1, NULL);
if (ret)
return ret;
@@ -3269,7 +3381,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
return 0;
if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
- EXTENT_UPTODATE, 1)) {
+ EXTENT_UPTODATE, 1, NULL)) {
return 0;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5bc20abf3f3d..4794ec891fed 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -13,10 +13,8 @@
#define EXTENT_DEFRAG (1 << 6)
#define EXTENT_DEFRAG_DONE (1 << 7)
#define EXTENT_BUFFER_FILLED (1 << 8)
-#define EXTENT_ORDERED (1 << 9)
-#define EXTENT_ORDERED_METADATA (1 << 10)
-#define EXTENT_BOUNDARY (1 << 11)
-#define EXTENT_NODATASUM (1 << 12)
+#define EXTENT_BOUNDARY (1 << 9)
+#define EXTENT_NODATASUM (1 << 10)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
/* flags for bio submission */
@@ -62,8 +60,13 @@ struct extent_io_ops {
struct extent_state *state, int uptodate);
int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
unsigned long old, unsigned long bits);
- int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
- unsigned long old, unsigned long bits);
+ int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
+ unsigned long bits);
+ int (*merge_extent_hook)(struct inode *inode,
+ struct extent_state *new,
+ struct extent_state *other);
+ int (*split_extent_hook)(struct inode *inode,
+ struct extent_state *orig, u64 split);
int (*write_cache_pages_lock_hook)(struct page *page);
};
@@ -81,10 +84,14 @@ struct extent_state {
u64 start;
u64 end; /* inclusive */
struct rb_node rb_node;
+
+ /* ADD NEW ELEMENTS AFTER THIS */
struct extent_io_tree *tree;
wait_queue_head_t wq;
atomic_t refs;
unsigned long state;
+ u64 split_start;
+ u64 split_end;
/* for use by the FS */
u64 private;
@@ -142,6 +149,8 @@ int try_release_extent_state(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ int bits, struct extent_state **cached, gfp_t mask);
int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
@@ -155,11 +164,12 @@ u64 count_range_bits(struct extent_io_tree *tree,
u64 max_bytes, unsigned long bits);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int filled);
+ int bits, int filled, struct extent_state *cached_state);
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- int bits, int wake, int delete, gfp_t mask);
+ int bits, int wake, int delete, struct extent_state **cached,
+ gfp_t mask);
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask);
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
@@ -282,5 +292,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
int clear_unlock,
int clear_delalloc, int clear_dirty,
int set_writeback,
- int end_writeback);
+ int end_writeback,
+ int set_private2);
#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 30c9365861e6..2c726b7b9faa 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -36,7 +36,7 @@ void extent_map_exit(void)
void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
{
tree->map.rb_node = NULL;
- spin_lock_init(&tree->lock);
+ rwlock_init(&tree->lock);
}
/**
@@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
return 0;
}
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+{
+ int ret = 0;
+ struct extent_map *merge = NULL;
+ struct rb_node *rb;
+ struct extent_map *em;
+
+ write_lock(&tree->lock);
+ em = lookup_extent_mapping(tree, start, len);
+
+ WARN_ON(em->start != start || !em);
+
+ if (!em)
+ goto out;
+
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ if (em->start != 0) {
+ rb = rb_prev(&em->rb_node);
+ if (rb)
+ merge = rb_entry(rb, struct extent_map, rb_node);
+ if (rb && mergable_maps(merge, em)) {
+ em->start = merge->start;
+ em->len += merge->len;
+ em->block_len += merge->block_len;
+ em->block_start = merge->block_start;
+ merge->in_tree = 0;
+ rb_erase(&merge->rb_node, &tree->map);
+ free_extent_map(merge);
+ }
+ }
+
+ rb = rb_next(&em->rb_node);
+ if (rb)
+ merge = rb_entry(rb, struct extent_map, rb_node);
+ if (rb && mergable_maps(em, merge)) {
+ em->len += merge->len;
+ em->block_len += merge->len;
+ rb_erase(&merge->rb_node, &tree->map);
+ merge->in_tree = 0;
+ free_extent_map(merge);
+ }
+
+ free_extent_map(em);
+out:
+ write_unlock(&tree->lock);
+ return ret;
+
+}
+
/**
* add_extent_mapping - add new extent map to the extent tree
* @tree: tree to insert new map in
@@ -222,7 +272,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
ret = -EEXIST;
goto out;
}
- assert_spin_locked(&tree->lock);
rb = tree_insert(&tree->map, em->start, &em->rb_node);
if (rb) {
ret = -EEXIST;
@@ -285,7 +334,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
struct rb_node *next = NULL;
u64 end = range_end(start, len);
- assert_spin_locked(&tree->lock);
rb_node = __tree_search(&tree->map, start, &prev, &next);
if (!rb_node && prev) {
em = rb_entry(prev, struct extent_map, rb_node);
@@ -319,6 +367,54 @@ out:
}
/**
+ * search_extent_mapping - find a nearby extent map
+ * @tree: tree to lookup in
+ * @start: byte offset to start the search
+ * @len: length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, len] range.
+ *
+ * If one can't be found, any nearby extent may be returned
+ */
+struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len)
+{
+ struct extent_map *em;
+ struct rb_node *rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *next = NULL;
+
+ rb_node = __tree_search(&tree->map, start, &prev, &next);
+ if (!rb_node && prev) {
+ em = rb_entry(prev, struct extent_map, rb_node);
+ goto found;
+ }
+ if (!rb_node && next) {
+ em = rb_entry(next, struct extent_map, rb_node);
+ goto found;
+ }
+ if (!rb_node) {
+ em = NULL;
+ goto out;
+ }
+ if (IS_ERR(rb_node)) {
+ em = ERR_PTR(PTR_ERR(rb_node));
+ goto out;
+ }
+ em = rb_entry(rb_node, struct extent_map, rb_node);
+ goto found;
+
+ em = NULL;
+ goto out;
+
+found:
+ atomic_inc(&em->refs);
+out:
+ return em;
+}
+
+/**
* remove_extent_mapping - removes an extent_map from the extent tree
* @tree: extent tree to remove from
* @em: extent map beeing removed
@@ -331,7 +427,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
int ret = 0;
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
- assert_spin_locked(&tree->lock);
rb_erase(&em->rb_node, &tree->map);
em->in_tree = 0;
return ret;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index fb6eeef06bb0..ab6d74b6e647 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -31,7 +31,7 @@ struct extent_map {
struct extent_map_tree {
struct rb_root map;
- spinlock_t lock;
+ rwlock_t lock;
};
static inline u64 extent_map_end(struct extent_map *em)
@@ -59,4 +59,7 @@ struct extent_map *alloc_extent_map(gfp_t mask);
void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
void extent_map_exit(void);
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
+struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len);
#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 126477eaecf5..f19e1259a971 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -22,7 +22,6 @@
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
-#include <linux/smp_lock.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
@@ -113,8 +112,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
int err = 0;
int i;
struct inode *inode = fdentry(file)->d_inode;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
- u64 hint_byte;
u64 num_bytes;
u64 start_pos;
u64 end_of_last_block;
@@ -126,23 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
end_of_last_block = start_pos + num_bytes - 1;
+ err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+ if (err)
+ return err;
- lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
- trans = btrfs_join_transaction(root, 1);
- if (!trans) {
- err = -ENOMEM;
- goto out_unlock;
- }
- btrfs_set_trans_block_group(trans, inode);
- hint_byte = 0;
-
- set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
-
- /* check for reserved extents on each page, we don't want
- * to reset the delalloc bit on things that already have
- * extents reserved.
- */
- btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
SetPageUptodate(p);
@@ -151,11 +135,11 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
}
if (end_pos > isize) {
i_size_write(inode, end_pos);
- btrfs_update_inode(trans, root, inode);
+ /* we've only changed i_size in ram, and we haven't updated
+ * the disk i_size. There is no need to log the inode
+ * at this time.
+ */
}
- err = btrfs_end_transaction(trans, root);
-out_unlock:
- unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
return err;
}
@@ -187,18 +171,18 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
if (!split2)
split2 = alloc_extent_map(GFP_NOFS);
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (!em) {
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
break;
}
flags = em->flags;
if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
- spin_unlock(&em_tree->lock);
if (em->start <= start &&
(!testend || em->start + em->len >= start + len)) {
free_extent_map(em);
+ write_unlock(&em_tree->lock);
break;
}
if (start < em->start) {
@@ -208,6 +192,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
start = em->start + em->len;
}
free_extent_map(em);
+ write_unlock(&em_tree->lock);
continue;
}
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
@@ -258,7 +243,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
free_extent_map(split);
split = NULL;
}
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
/* once for us */
free_extent_map(em);
@@ -287,7 +272,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
u64 start, u64 end, u64 locked_end,
- u64 inline_limit, u64 *hint_byte)
+ u64 inline_limit, u64 *hint_byte, int drop_cache)
{
u64 extent_end = 0;
u64 search_start = start;
@@ -312,7 +297,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
int ret;
inline_limit = 0;
- btrfs_drop_extent_cache(inode, start, end - 1, 0);
+ if (drop_cache)
+ btrfs_drop_extent_cache(inode, start, end - 1, 0);
path = btrfs_alloc_path();
if (!path)
@@ -934,21 +920,35 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
start_pos = pos;
vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+ /* do the reserve before the mutex lock in case we have to do some
+ * flushing. We wouldn't deadlock, but this is more polite.
+ */
+ err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+ if (err)
+ goto out_nolock;
+
+ mutex_lock(&inode->i_mutex);
+
current->backing_dev_info = inode->i_mapping->backing_dev_info;
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
- goto out_nolock;
+ goto out;
+
if (count == 0)
- goto out_nolock;
+ goto out;
err = file_remove_suid(file);
if (err)
- goto out_nolock;
+ goto out;
+
file_update_time(file);
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
- mutex_lock(&inode->i_mutex);
+ /* generic_write_checks can change our pos */
+ start_pos = pos;
+
BTRFS_I(inode)->sequence++;
first_index = pos >> PAGE_CACHE_SHIFT;
last_index = (pos + count) >> PAGE_CACHE_SHIFT;
@@ -1022,9 +1022,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
}
if (will_write) {
- btrfs_fdatawrite_range(inode->i_mapping, pos,
- pos + write_bytes - 1,
- WB_SYNC_ALL);
+ filemap_fdatawrite_range(inode->i_mapping, pos,
+ pos + write_bytes - 1);
} else {
balance_dirty_pages_ratelimited_nr(inode->i_mapping,
num_pages);
@@ -1045,6 +1044,7 @@ out:
mutex_unlock(&inode->i_mutex);
if (ret)
err = ret;
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
out_nolock:
kfree(pages);
@@ -1201,7 +1201,7 @@ out:
return ret > 0 ? EIO : ret;
}
-static struct vm_operations_struct btrfs_file_vm_ops = {
+static const struct vm_operations_struct btrfs_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = btrfs_page_mkwrite,
};
@@ -1213,7 +1213,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
return 0;
}
-struct file_operations btrfs_file_operations = {
+const struct file_operations btrfs_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
.aio_read = generic_file_aio_read,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4538e48581a5..5c2caad76212 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -16,45 +16,46 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/pagemap.h>
#include <linux/sched.h>
+#include <linux/math64.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
-struct btrfs_free_space {
- struct rb_node bytes_index;
- struct rb_node offset_index;
- u64 offset;
- u64 bytes;
-};
+#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
+#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
-static int tree_insert_offset(struct rb_root *root, u64 offset,
- struct rb_node *node)
+static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
+ u64 offset)
{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct btrfs_free_space *info;
+ BUG_ON(offset < bitmap_start);
+ offset -= bitmap_start;
+ return (unsigned long)(div64_u64(offset, sectorsize));
+}
- while (*p) {
- parent = *p;
- info = rb_entry(parent, struct btrfs_free_space, offset_index);
+static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
+{
+ return (unsigned long)(div64_u64(bytes, sectorsize));
+}
- if (offset < info->offset)
- p = &(*p)->rb_left;
- else if (offset > info->offset)
- p = &(*p)->rb_right;
- else
- return -EEXIST;
- }
+static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
+ u64 offset)
+{
+ u64 bitmap_start;
+ u64 bytes_per_bitmap;
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
+ bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
+ bitmap_start = offset - block_group->key.objectid;
+ bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
+ bitmap_start *= bytes_per_bitmap;
+ bitmap_start += block_group->key.objectid;
- return 0;
+ return bitmap_start;
}
-static int tree_insert_bytes(struct rb_root *root, u64 bytes,
- struct rb_node *node)
+static int tree_insert_offset(struct rb_root *root, u64 offset,
+ struct rb_node *node, int bitmap)
{
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
@@ -62,12 +63,34 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
while (*p) {
parent = *p;
- info = rb_entry(parent, struct btrfs_free_space, bytes_index);
+ info = rb_entry(parent, struct btrfs_free_space, offset_index);
- if (bytes < info->bytes)
+ if (offset < info->offset) {
p = &(*p)->rb_left;
- else
+ } else if (offset > info->offset) {
p = &(*p)->rb_right;
+ } else {
+ /*
+ * we could have a bitmap entry and an extent entry
+ * share the same offset. If this is the case, we want
+ * the extent entry to always be found first if we do a
+ * linear search through the tree, since we want to have
+ * the quickest allocation time, and allocating from an
+ * extent is faster than allocating from a bitmap. So
+ * if we're inserting a bitmap and we find an entry at
+ * this offset, we want to go right, or after this entry
+ * logically. If we are inserting an extent and we've
+ * found a bitmap, we want to go left, or before
+ * logically.
+ */
+ if (bitmap) {
+ WARN_ON(info->bitmap);
+ p = &(*p)->rb_right;
+ } else {
+ WARN_ON(!info->bitmap);
+ p = &(*p)->rb_left;
+ }
+ }
}
rb_link_node(node, parent, p);
@@ -79,110 +102,143 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes,
/*
* searches the tree for the given offset.
*
- * fuzzy == 1: this is used for allocations where we are given a hint of where
- * to look for free space. Because the hint may not be completely on an offset
- * mark, or the hint may no longer point to free space we need to fudge our
- * results a bit. So we look for free space starting at or after offset with at
- * least bytes size. We prefer to find as close to the given offset as we can.
- * Also if the offset is within a free space range, then we will return the free
- * space that contains the given offset, which means we can return a free space
- * chunk with an offset before the provided offset.
- *
- * fuzzy == 0: this is just a normal tree search. Give us the free space that
- * starts at the given offset which is at least bytes size, and if its not there
- * return NULL.
+ * fuzzy - If this is set, then we are trying to make an allocation, and we just
+ * want a section that has at least bytes size and comes at or after the given
+ * offset.
*/
-static struct btrfs_free_space *tree_search_offset(struct rb_root *root,
- u64 offset, u64 bytes,
- int fuzzy)
+static struct btrfs_free_space *
+tree_search_offset(struct btrfs_block_group_cache *block_group,
+ u64 offset, int bitmap_only, int fuzzy)
{
- struct rb_node *n = root->rb_node;
- struct btrfs_free_space *entry, *ret = NULL;
+ struct rb_node *n = block_group->free_space_offset.rb_node;
+ struct btrfs_free_space *entry, *prev = NULL;
+
+ /* find entry that is closest to the 'offset' */
+ while (1) {
+ if (!n) {
+ entry = NULL;
+ break;
+ }
- while (n) {
entry = rb_entry(n, struct btrfs_free_space, offset_index);
+ prev = entry;
- if (offset < entry->offset) {
- if (fuzzy &&
- (!ret || entry->offset < ret->offset) &&
- (bytes <= entry->bytes))
- ret = entry;
+ if (offset < entry->offset)
n = n->rb_left;
- } else if (offset > entry->offset) {
- if (fuzzy &&
- (entry->offset + entry->bytes - 1) >= offset &&
- bytes <= entry->bytes) {
- ret = entry;
- break;
- }
+ else if (offset > entry->offset)
n = n->rb_right;
- } else {
- if (bytes > entry->bytes) {
- n = n->rb_right;
- continue;
- }
- ret = entry;
+ else
break;
- }
}
- return ret;
-}
+ if (bitmap_only) {
+ if (!entry)
+ return NULL;
+ if (entry->bitmap)
+ return entry;
-/*
- * return a chunk at least bytes size, as close to offset that we can get.
- */
-static struct btrfs_free_space *tree_search_bytes(struct rb_root *root,
- u64 offset, u64 bytes)
-{
- struct rb_node *n = root->rb_node;
- struct btrfs_free_space *entry, *ret = NULL;
-
- while (n) {
- entry = rb_entry(n, struct btrfs_free_space, bytes_index);
+ /*
+ * bitmap entry and extent entry may share same offset,
+ * in that case, bitmap entry comes after extent entry.
+ */
+ n = rb_next(n);
+ if (!n)
+ return NULL;
+ entry = rb_entry(n, struct btrfs_free_space, offset_index);
+ if (entry->offset != offset)
+ return NULL;
- if (bytes < entry->bytes) {
+ WARN_ON(!entry->bitmap);
+ return entry;
+ } else if (entry) {
+ if (entry->bitmap) {
/*
- * We prefer to get a hole size as close to the size we
- * are asking for so we don't take small slivers out of
- * huge holes, but we also want to get as close to the
- * offset as possible so we don't have a whole lot of
- * fragmentation.
+ * if previous extent entry covers the offset,
+ * we should return it instead of the bitmap entry
*/
- if (offset <= entry->offset) {
- if (!ret)
- ret = entry;
- else if (entry->bytes < ret->bytes)
- ret = entry;
- else if (entry->offset < ret->offset)
- ret = entry;
+ n = &entry->offset_index;
+ while (1) {
+ n = rb_prev(n);
+ if (!n)
+ break;
+ prev = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ if (!prev->bitmap) {
+ if (prev->offset + prev->bytes > offset)
+ entry = prev;
+ break;
+ }
}
- n = n->rb_left;
- } else if (bytes > entry->bytes) {
- n = n->rb_right;
+ }
+ return entry;
+ }
+
+ if (!prev)
+ return NULL;
+
+ /* find last entry before the 'offset' */
+ entry = prev;
+ if (entry->offset > offset) {
+ n = rb_prev(&entry->offset_index);
+ if (n) {
+ entry = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ BUG_ON(entry->offset > offset);
} else {
- /*
- * Ok we may have multiple chunks of the wanted size,
- * so we don't want to take the first one we find, we
- * want to take the one closest to our given offset, so
- * keep searching just in case theres a better match.
- */
- n = n->rb_right;
- if (offset > entry->offset)
- continue;
- else if (!ret || entry->offset < ret->offset)
- ret = entry;
+ if (fuzzy)
+ return entry;
+ else
+ return NULL;
}
}
- return ret;
+ if (entry->bitmap) {
+ n = &entry->offset_index;
+ while (1) {
+ n = rb_prev(n);
+ if (!n)
+ break;
+ prev = rb_entry(n, struct btrfs_free_space,
+ offset_index);
+ if (!prev->bitmap) {
+ if (prev->offset + prev->bytes > offset)
+ return prev;
+ break;
+ }
+ }
+ if (entry->offset + BITS_PER_BITMAP *
+ block_group->sectorsize > offset)
+ return entry;
+ } else if (entry->offset + entry->bytes > offset)
+ return entry;
+
+ if (!fuzzy)
+ return NULL;
+
+ while (1) {
+ if (entry->bitmap) {
+ if (entry->offset + BITS_PER_BITMAP *
+ block_group->sectorsize > offset)
+ break;
+ } else {
+ if (entry->offset + entry->bytes > offset)
+ break;
+ }
+
+ n = rb_next(&entry->offset_index);
+ if (!n)
+ return NULL;
+ entry = rb_entry(n, struct btrfs_free_space, offset_index);
+ }
+ return entry;
}
static void unlink_free_space(struct btrfs_block_group_cache *block_group,
struct btrfs_free_space *info)
{
rb_erase(&info->offset_index, &block_group->free_space_offset);
- rb_erase(&info->bytes_index, &block_group->free_space_bytes);
+ block_group->free_extents--;
+ block_group->free_space -= info->bytes;
}
static int link_free_space(struct btrfs_block_group_cache *block_group,
@@ -190,17 +246,361 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
{
int ret = 0;
-
- BUG_ON(!info->bytes);
+ BUG_ON(!info->bitmap && !info->bytes);
ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
- &info->offset_index);
+ &info->offset_index, (info->bitmap != NULL));
if (ret)
return ret;
- ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes,
- &info->bytes_index);
- if (ret)
- return ret;
+ block_group->free_space += info->bytes;
+ block_group->free_extents++;
+ return ret;
+}
+
+static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
+{
+ u64 max_bytes;
+ u64 bitmap_bytes;
+ u64 extent_bytes;
+
+ /*
+ * The goal is to keep the total amount of memory used per 1gb of space
+ * at or below 32k, so we need to adjust how much memory we allow to be
+ * used by extent based free space tracking
+ */
+ max_bytes = MAX_CACHE_BYTES_PER_GIG *
+ (div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
+
+ /*
+ * we want to account for 1 more bitmap than what we have so we can make
+ * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
+ * we add more bitmaps.
+ */
+ bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;
+
+ if (bitmap_bytes >= max_bytes) {
+ block_group->extents_thresh = 0;
+ return;
+ }
+
+ /*
+ * we want the extent entry threshold to always be at most 1/2 the maxw
+ * bytes we can have, or whatever is less than that.
+ */
+ extent_bytes = max_bytes - bitmap_bytes;
+ extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
+
+ block_group->extents_thresh =
+ div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
+}
+
+static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes)
+{
+ unsigned long start, end;
+ unsigned long i;
+
+ start = offset_to_bit(info->offset, block_group->sectorsize, offset);
+ end = start + bytes_to_bits(bytes, block_group->sectorsize);
+ BUG_ON(end > BITS_PER_BITMAP);
+
+ for (i = start; i < end; i++)
+ clear_bit(i, info->bitmap);
+
+ info->bytes -= bytes;
+ block_group->free_space -= bytes;
+}
+
+static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *info, u64 offset,
+ u64 bytes)
+{
+ unsigned long start, end;
+ unsigned long i;
+
+ start = offset_to_bit(info->offset, block_group->sectorsize, offset);
+ end = start + bytes_to_bits(bytes, block_group->sectorsize);
+ BUG_ON(end > BITS_PER_BITMAP);
+
+ for (i = start; i < end; i++)
+ set_bit(i, info->bitmap);
+
+ info->bytes += bytes;
+ block_group->free_space += bytes;
+}
+
+static int search_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *bitmap_info, u64 *offset,
+ u64 *bytes)
+{
+ unsigned long found_bits = 0;
+ unsigned long bits, i;
+ unsigned long next_zero;
+
+ i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
+ max_t(u64, *offset, bitmap_info->offset));
+ bits = bytes_to_bits(*bytes, block_group->sectorsize);
+
+ for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
+ i < BITS_PER_BITMAP;
+ i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
+ next_zero = find_next_zero_bit(bitmap_info->bitmap,
+ BITS_PER_BITMAP, i);
+ if ((next_zero - i) >= bits) {
+ found_bits = next_zero - i;
+ break;
+ }
+ i = next_zero;
+ }
+
+ if (found_bits) {
+ *offset = (u64)(i * block_group->sectorsize) +
+ bitmap_info->offset;
+ *bytes = (u64)(found_bits) * block_group->sectorsize;
+ return 0;
+ }
+
+ return -1;
+}
+
+static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
+ *block_group, u64 *offset,
+ u64 *bytes, int debug)
+{
+ struct btrfs_free_space *entry;
+ struct rb_node *node;
+ int ret;
+
+ if (!block_group->free_space_offset.rb_node)
+ return NULL;
+
+ entry = tree_search_offset(block_group,
+ offset_to_bitmap(block_group, *offset),
+ 0, 1);
+ if (!entry)
+ return NULL;
+
+ for (node = &entry->offset_index; node; node = rb_next(node)) {
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ if (entry->bytes < *bytes)
+ continue;
+
+ if (entry->bitmap) {
+ ret = search_bitmap(block_group, entry, offset, bytes);
+ if (!ret)
+ return entry;
+ continue;
+ }
+
+ *offset = entry->offset;
+ *bytes = entry->bytes;
+ return entry;
+ }
+
+ return NULL;
+}
+
+static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *info, u64 offset)
+{
+ u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
+ int max_bitmaps = (int)div64_u64(block_group->key.offset +
+ bytes_per_bg - 1, bytes_per_bg);
+ BUG_ON(block_group->total_bitmaps >= max_bitmaps);
+
+ info->offset = offset_to_bitmap(block_group, offset);
+ info->bytes = 0;
+ link_free_space(block_group, info);
+ block_group->total_bitmaps++;
+
+ recalculate_thresholds(block_group);
+}
+
+static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *bitmap_info,
+ u64 *offset, u64 *bytes)
+{
+ u64 end;
+ u64 search_start, search_bytes;
+ int ret;
+
+again:
+ end = bitmap_info->offset +
+ (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
+
+ /*
+ * XXX - this can go away after a few releases.
+ *
+ * since the only user of btrfs_remove_free_space is the tree logging
+ * stuff, and the only way to test that is under crash conditions, we
+ * want to have this debug stuff here just in case somethings not
+ * working. Search the bitmap for the space we are trying to use to
+ * make sure its actually there. If its not there then we need to stop
+ * because something has gone wrong.
+ */
+ search_start = *offset;
+ search_bytes = *bytes;
+ ret = search_bitmap(block_group, bitmap_info, &search_start,
+ &search_bytes);
+ BUG_ON(ret < 0 || search_start != *offset);
+
+ if (*offset > bitmap_info->offset && *offset + *bytes > end) {
+ bitmap_clear_bits(block_group, bitmap_info, *offset,
+ end - *offset + 1);
+ *bytes -= end - *offset + 1;
+ *offset = end + 1;
+ } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
+ bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
+ *bytes = 0;
+ }
+
+ if (*bytes) {
+ struct rb_node *next = rb_next(&bitmap_info->offset_index);
+ if (!bitmap_info->bytes) {
+ unlink_free_space(block_group, bitmap_info);
+ kfree(bitmap_info->bitmap);
+ kfree(bitmap_info);
+ block_group->total_bitmaps--;
+ recalculate_thresholds(block_group);
+ }
+
+ /*
+ * no entry after this bitmap, but we still have bytes to
+ * remove, so something has gone wrong.
+ */
+ if (!next)
+ return -EINVAL;
+
+ bitmap_info = rb_entry(next, struct btrfs_free_space,
+ offset_index);
+
+ /*
+ * if the next entry isn't a bitmap we need to return to let the
+ * extent stuff do its work.
+ */
+ if (!bitmap_info->bitmap)
+ return -EAGAIN;
+
+ /*
+ * Ok the next item is a bitmap, but it may not actually hold
+ * the information for the rest of this free space stuff, so
+ * look for it, and if we don't find it return so we can try
+ * everything over again.
+ */
+ search_start = *offset;
+ search_bytes = *bytes;
+ ret = search_bitmap(block_group, bitmap_info, &search_start,
+ &search_bytes);
+ if (ret < 0 || search_start != *offset)
+ return -EAGAIN;
+
+ goto again;
+ } else if (!bitmap_info->bytes) {
+ unlink_free_space(block_group, bitmap_info);
+ kfree(bitmap_info->bitmap);
+ kfree(bitmap_info);
+ block_group->total_bitmaps--;
+ recalculate_thresholds(block_group);
+ }
+
+ return 0;
+}
+
+static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *info)
+{
+ struct btrfs_free_space *bitmap_info;
+ int added = 0;
+ u64 bytes, offset, end;
+ int ret;
+
+ /*
+ * If we are below the extents threshold then we can add this as an
+ * extent, and don't have to deal with the bitmap
+ */
+ if (block_group->free_extents < block_group->extents_thresh &&
+ info->bytes > block_group->sectorsize * 4)
+ return 0;
+
+ /*
+ * some block groups are so tiny they can't be enveloped by a bitmap, so
+ * don't even bother to create a bitmap for this
+ */
+ if (BITS_PER_BITMAP * block_group->sectorsize >
+ block_group->key.offset)
+ return 0;
+
+ bytes = info->bytes;
+ offset = info->offset;
+
+again:
+ bitmap_info = tree_search_offset(block_group,
+ offset_to_bitmap(block_group, offset),
+ 1, 0);
+ if (!bitmap_info) {
+ BUG_ON(added);
+ goto new_bitmap;
+ }
+
+ end = bitmap_info->offset +
+ (u64)(BITS_PER_BITMAP * block_group->sectorsize);
+
+ if (offset >= bitmap_info->offset && offset + bytes > end) {
+ bitmap_set_bits(block_group, bitmap_info, offset,
+ end - offset);
+ bytes -= end - offset;
+ offset = end;
+ added = 0;
+ } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
+ bitmap_set_bits(block_group, bitmap_info, offset, bytes);
+ bytes = 0;
+ } else {
+ BUG();
+ }
+
+ if (!bytes) {
+ ret = 1;
+ goto out;
+ } else
+ goto again;
+
+new_bitmap:
+ if (info && info->bitmap) {
+ add_new_bitmap(block_group, info, offset);
+ added = 1;
+ info = NULL;
+ goto again;
+ } else {
+ spin_unlock(&block_group->tree_lock);
+
+ /* no pre-allocated info, allocate a new one */
+ if (!info) {
+ info = kzalloc(sizeof(struct btrfs_free_space),
+ GFP_NOFS);
+ if (!info) {
+ spin_lock(&block_group->tree_lock);
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* allocate the bitmap */
+ info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+ spin_lock(&block_group->tree_lock);
+ if (!info->bitmap) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ goto again;
+ }
+
+out:
+ if (info) {
+ if (info->bitmap)
+ kfree(info->bitmap);
+ kfree(info);
+ }
return ret;
}
@@ -208,8 +608,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
u64 offset, u64 bytes)
{
- struct btrfs_free_space *right_info;
- struct btrfs_free_space *left_info;
+ struct btrfs_free_space *right_info = NULL;
+ struct btrfs_free_space *left_info = NULL;
struct btrfs_free_space *info = NULL;
int ret = 0;
@@ -227,18 +627,38 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
* are adding, if there is remove that struct and add a new one to
* cover the entire range
*/
- right_info = tree_search_offset(&block_group->free_space_offset,
- offset+bytes, 0, 0);
- left_info = tree_search_offset(&block_group->free_space_offset,
- offset-1, 0, 1);
+ right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
+ if (right_info && rb_prev(&right_info->offset_index))
+ left_info = rb_entry(rb_prev(&right_info->offset_index),
+ struct btrfs_free_space, offset_index);
+ else
+ left_info = tree_search_offset(block_group, offset - 1, 0, 0);
+
+ /*
+ * If there was no extent directly to the left or right of this new
+ * extent then we know we're going to have to allocate a new extent, so
+ * before we do that see if we need to drop this into a bitmap
+ */
+ if ((!left_info || left_info->bitmap) &&
+ (!right_info || right_info->bitmap)) {
+ ret = insert_into_bitmap(block_group, info);
+
+ if (ret < 0) {
+ goto out;
+ } else if (ret) {
+ ret = 0;
+ goto out;
+ }
+ }
- if (right_info) {
+ if (right_info && !right_info->bitmap) {
unlink_free_space(block_group, right_info);
info->bytes += right_info->bytes;
kfree(right_info);
}
- if (left_info && left_info->offset + left_info->bytes == offset) {
+ if (left_info && !left_info->bitmap &&
+ left_info->offset + left_info->bytes == offset) {
unlink_free_space(block_group, left_info);
info->offset = left_info->offset;
info->bytes += left_info->bytes;
@@ -248,11 +668,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
ret = link_free_space(block_group, info);
if (ret)
kfree(info);
-
+out:
spin_unlock(&block_group->tree_lock);
if (ret) {
- printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret);
+ printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
BUG_ON(ret == -EEXIST);
}
@@ -263,40 +683,74 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
u64 offset, u64 bytes)
{
struct btrfs_free_space *info;
+ struct btrfs_free_space *next_info = NULL;
int ret = 0;
spin_lock(&block_group->tree_lock);
- info = tree_search_offset(&block_group->free_space_offset, offset, 0,
- 1);
- if (info && info->offset == offset) {
- if (info->bytes < bytes) {
- printk(KERN_ERR "Found free space at %llu, size %llu,"
- "trying to use %llu\n",
- (unsigned long long)info->offset,
- (unsigned long long)info->bytes,
- (unsigned long long)bytes);
+again:
+ info = tree_search_offset(block_group, offset, 0, 0);
+ if (!info) {
+ /*
+ * oops didn't find an extent that matched the space we wanted
+ * to remove, look for a bitmap instead
+ */
+ info = tree_search_offset(block_group,
+ offset_to_bitmap(block_group, offset),
+ 1, 0);
+ if (!info) {
+ WARN_ON(1);
+ goto out_lock;
+ }
+ }
+
+ if (info->bytes < bytes && rb_next(&info->offset_index)) {
+ u64 end;
+ next_info = rb_entry(rb_next(&info->offset_index),
+ struct btrfs_free_space,
+ offset_index);
+
+ if (next_info->bitmap)
+ end = next_info->offset + BITS_PER_BITMAP *
+ block_group->sectorsize - 1;
+ else
+ end = next_info->offset + next_info->bytes;
+
+ if (next_info->bytes < bytes ||
+ next_info->offset > offset || offset > end) {
+ printk(KERN_CRIT "Found free space at %llu, size %llu,"
+ " trying to use %llu\n",
+ (unsigned long long)info->offset,
+ (unsigned long long)info->bytes,
+ (unsigned long long)bytes);
WARN_ON(1);
ret = -EINVAL;
- spin_unlock(&block_group->tree_lock);
- goto out;
+ goto out_lock;
}
- unlink_free_space(block_group, info);
- if (info->bytes == bytes) {
- kfree(info);
- spin_unlock(&block_group->tree_lock);
- goto out;
+ info = next_info;
+ }
+
+ if (info->bytes == bytes) {
+ unlink_free_space(block_group, info);
+ if (info->bitmap) {
+ kfree(info->bitmap);
+ block_group->total_bitmaps--;
}
+ kfree(info);
+ goto out_lock;
+ }
+ if (!info->bitmap && info->offset == offset) {
+ unlink_free_space(block_group, info);
info->offset += bytes;
info->bytes -= bytes;
+ link_free_space(block_group, info);
+ goto out_lock;
+ }
- ret = link_free_space(block_group, info);
- spin_unlock(&block_group->tree_lock);
- BUG_ON(ret);
- } else if (info && info->offset < offset &&
- info->offset + info->bytes >= offset + bytes) {
+ if (!info->bitmap && info->offset <= offset &&
+ info->offset + info->bytes >= offset + bytes) {
u64 old_start = info->offset;
/*
* we're freeing space in the middle of the info,
@@ -312,7 +766,9 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
info->offset = offset + bytes;
info->bytes = old_end - info->offset;
ret = link_free_space(block_group, info);
- BUG_ON(ret);
+ WARN_ON(ret);
+ if (ret)
+ goto out_lock;
} else {
/* the hole we're creating ends at the end
* of the info struct, just free the info
@@ -320,32 +776,22 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
kfree(info);
}
spin_unlock(&block_group->tree_lock);
- /* step two, insert a new info struct to cover anything
- * before the hole
+
+ /* step two, insert a new info struct to cover
+ * anything before the hole
*/
ret = btrfs_add_free_space(block_group, old_start,
offset - old_start);
- BUG_ON(ret);
- } else {
- spin_unlock(&block_group->tree_lock);
- if (!info) {
- printk(KERN_ERR "couldn't find space %llu to free\n",
- (unsigned long long)offset);
- printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n",
- block_group->cached,
- (unsigned long long)block_group->key.objectid,
- (unsigned long long)block_group->key.offset);
- btrfs_dump_free_space(block_group, bytes);
- } else if (info) {
- printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, "
- "but wanted offset=%llu bytes=%llu\n",
- (unsigned long long)info->offset,
- (unsigned long long)info->bytes,
- (unsigned long long)offset,
- (unsigned long long)bytes);
- }
- WARN_ON(1);
+ WARN_ON(ret);
+ goto out;
}
+
+ ret = remove_from_bitmap(block_group, info, &offset, &bytes);
+ if (ret == -EAGAIN)
+ goto again;
+ BUG_ON(ret);
+out_lock:
+ spin_unlock(&block_group->tree_lock);
out:
return ret;
}
@@ -361,10 +807,13 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
info = rb_entry(n, struct btrfs_free_space, offset_index);
if (info->bytes >= bytes)
count++;
- printk(KERN_ERR "entry offset %llu, bytes %llu\n",
+ printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
(unsigned long long)info->offset,
- (unsigned long long)info->bytes);
+ (unsigned long long)info->bytes,
+ (info->bitmap) ? "yes" : "no");
}
+ printk(KERN_INFO "block group has cluster?: %s\n",
+ list_empty(&block_group->cluster_list) ? "no" : "yes");
printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
"\n", count);
}
@@ -397,26 +846,35 @@ __btrfs_return_cluster_to_free_space(
{
struct btrfs_free_space *entry;
struct rb_node *node;
+ bool bitmap;
spin_lock(&cluster->lock);
if (cluster->block_group != block_group)
goto out;
+ bitmap = cluster->points_to_bitmap;
+ cluster->block_group = NULL;
cluster->window_start = 0;
+ list_del_init(&cluster->block_group_list);
+ cluster->points_to_bitmap = false;
+
+ if (bitmap)
+ goto out;
+
node = rb_first(&cluster->root);
- while(node) {
+ while (node) {
entry = rb_entry(node, struct btrfs_free_space, offset_index);
node = rb_next(&entry->offset_index);
rb_erase(&entry->offset_index, &cluster->root);
- link_free_space(block_group, entry);
+ BUG_ON(entry->bitmap);
+ tree_insert_offset(&block_group->free_space_offset,
+ entry->offset, &entry->offset_index, 0);
}
- list_del_init(&cluster->block_group_list);
-
- btrfs_put_block_group(cluster->block_group);
- cluster->block_group = NULL;
cluster->root.rb_node = NULL;
+
out:
spin_unlock(&cluster->lock);
+ btrfs_put_block_group(block_group);
return 0;
}
@@ -425,20 +883,28 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
struct btrfs_free_space *info;
struct rb_node *node;
struct btrfs_free_cluster *cluster;
- struct btrfs_free_cluster *safe;
+ struct list_head *head;
spin_lock(&block_group->tree_lock);
-
- list_for_each_entry_safe(cluster, safe, &block_group->cluster_list,
- block_group_list) {
+ while ((head = block_group->cluster_list.next) !=
+ &block_group->cluster_list) {
+ cluster = list_entry(head, struct btrfs_free_cluster,
+ block_group_list);
WARN_ON(cluster->block_group != block_group);
__btrfs_return_cluster_to_free_space(block_group, cluster);
+ if (need_resched()) {
+ spin_unlock(&block_group->tree_lock);
+ cond_resched();
+ spin_lock(&block_group->tree_lock);
+ }
}
- while ((node = rb_last(&block_group->free_space_bytes)) != NULL) {
- info = rb_entry(node, struct btrfs_free_space, bytes_index);
+ while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
+ info = rb_entry(node, struct btrfs_free_space, offset_index);
unlink_free_space(block_group, info);
+ if (info->bitmap)
+ kfree(info->bitmap);
kfree(info);
if (need_resched()) {
spin_unlock(&block_group->tree_lock);
@@ -446,6 +912,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
spin_lock(&block_group->tree_lock);
}
}
+
spin_unlock(&block_group->tree_lock);
}
@@ -453,25 +920,35 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
u64 offset, u64 bytes, u64 empty_size)
{
struct btrfs_free_space *entry = NULL;
+ u64 bytes_search = bytes + empty_size;
u64 ret = 0;
spin_lock(&block_group->tree_lock);
- entry = tree_search_offset(&block_group->free_space_offset, offset,
- bytes + empty_size, 1);
+ entry = find_free_space(block_group, &offset, &bytes_search, 0);
if (!entry)
- entry = tree_search_bytes(&block_group->free_space_bytes,
- offset, bytes + empty_size);
- if (entry) {
+ goto out;
+
+ ret = offset;
+ if (entry->bitmap) {
+ bitmap_clear_bits(block_group, entry, offset, bytes);
+ if (!entry->bytes) {
+ unlink_free_space(block_group, entry);
+ kfree(entry->bitmap);
+ kfree(entry);
+ block_group->total_bitmaps--;
+ recalculate_thresholds(block_group);
+ }
+ } else {
unlink_free_space(block_group, entry);
- ret = entry->offset;
entry->offset += bytes;
entry->bytes -= bytes;
-
if (!entry->bytes)
kfree(entry);
else
link_free_space(block_group, entry);
}
+
+out:
spin_unlock(&block_group->tree_lock);
return ret;
@@ -517,6 +994,54 @@ int btrfs_return_cluster_to_free_space(
return ret;
}
+static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ u64 bytes, u64 min_start)
+{
+ struct btrfs_free_space *entry;
+ int err;
+ u64 search_start = cluster->window_start;
+ u64 search_bytes = bytes;
+ u64 ret = 0;
+
+ spin_lock(&block_group->tree_lock);
+ spin_lock(&cluster->lock);
+
+ if (!cluster->points_to_bitmap)
+ goto out;
+
+ if (cluster->block_group != block_group)
+ goto out;
+
+ /*
+ * search_start is the beginning of the bitmap, but at some point it may
+ * be a good idea to point to the actual start of the free area in the
+ * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
+ * to 1 to make sure we get the bitmap entry
+ */
+ entry = tree_search_offset(block_group,
+ offset_to_bitmap(block_group, search_start),
+ 1, 0);
+ if (!entry || !entry->bitmap)
+ goto out;
+
+ search_start = min_start;
+ search_bytes = bytes;
+
+ err = search_bitmap(block_group, entry, &search_start,
+ &search_bytes);
+ if (err)
+ goto out;
+
+ ret = search_start;
+ bitmap_clear_bits(block_group, entry, ret, bytes);
+out:
+ spin_unlock(&cluster->lock);
+ spin_unlock(&block_group->tree_lock);
+
+ return ret;
+}
+
/*
* given a cluster, try to allocate 'bytes' from it, returns 0
* if it couldn't find anything suitably large, or a logical disk offset
@@ -530,6 +1055,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
struct rb_node *node;
u64 ret = 0;
+ if (cluster->points_to_bitmap)
+ return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
+ min_start);
+
spin_lock(&cluster->lock);
if (bytes > cluster->max_size)
goto out;
@@ -567,9 +1096,73 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
}
out:
spin_unlock(&cluster->lock);
+
return ret;
}
+static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_space *entry,
+ struct btrfs_free_cluster *cluster,
+ u64 offset, u64 bytes, u64 min_bytes)
+{
+ unsigned long next_zero;
+ unsigned long i;
+ unsigned long search_bits;
+ unsigned long total_bits;
+ unsigned long found_bits;
+ unsigned long start = 0;
+ unsigned long total_found = 0;
+ bool found = false;
+
+ i = offset_to_bit(entry->offset, block_group->sectorsize,
+ max_t(u64, offset, entry->offset));
+ search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+ total_bits = bytes_to_bits(bytes, block_group->sectorsize);
+
+again:
+ found_bits = 0;
+ for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
+ i < BITS_PER_BITMAP;
+ i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
+ next_zero = find_next_zero_bit(entry->bitmap,
+ BITS_PER_BITMAP, i);
+ if (next_zero - i >= search_bits) {
+ found_bits = next_zero - i;
+ break;
+ }
+ i = next_zero;
+ }
+
+ if (!found_bits)
+ return -1;
+
+ if (!found) {
+ start = i;
+ found = true;
+ }
+
+ total_found += found_bits;
+
+ if (cluster->max_size < found_bits * block_group->sectorsize)
+ cluster->max_size = found_bits * block_group->sectorsize;
+
+ if (total_found < total_bits) {
+ i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
+ if (i - start > total_bits * 2) {
+ total_found = 0;
+ cluster->max_size = 0;
+ found = false;
+ }
+ goto again;
+ }
+
+ cluster->window_start = start * block_group->sectorsize +
+ entry->offset;
+ cluster->points_to_bitmap = true;
+
+ return 0;
+}
+
/*
* here we try to find a cluster of blocks in a block group. The goal
* is to find at least bytes free and up to empty_size + bytes free.
@@ -587,12 +1180,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
struct btrfs_free_space *entry = NULL;
struct rb_node *node;
struct btrfs_free_space *next;
- struct btrfs_free_space *last;
+ struct btrfs_free_space *last = NULL;
u64 min_bytes;
u64 window_start;
u64 window_free;
u64 max_extent = 0;
- int total_retries = 0;
+ bool found_bitmap = false;
int ret;
/* for metadata, allow allocates with more holes */
@@ -620,31 +1213,80 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
goto out;
}
again:
- min_bytes = min(min_bytes, bytes + empty_size);
- entry = tree_search_bytes(&block_group->free_space_bytes,
- offset, min_bytes);
+ entry = tree_search_offset(block_group, offset, found_bitmap, 1);
if (!entry) {
ret = -ENOSPC;
goto out;
}
+
+ /*
+ * If found_bitmap is true, we exhausted our search for extent entries,
+ * and we just want to search all of the bitmaps that we can find, and
+ * ignore any extent entries we find.
+ */
+ while (entry->bitmap || found_bitmap ||
+ (!entry->bitmap && entry->bytes < min_bytes)) {
+ struct rb_node *node = rb_next(&entry->offset_index);
+
+ if (entry->bitmap && entry->bytes > bytes + empty_size) {
+ ret = btrfs_bitmap_cluster(block_group, entry, cluster,
+ offset, bytes + empty_size,
+ min_bytes);
+ if (!ret)
+ goto got_it;
+ }
+
+ if (!node) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ entry = rb_entry(node, struct btrfs_free_space, offset_index);
+ }
+
+ /*
+ * We already searched all the extent entries from the passed in offset
+ * to the end and didn't find enough space for the cluster, and we also
+ * didn't find any bitmaps that met our criteria, just go ahead and exit
+ */
+ if (found_bitmap) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ cluster->points_to_bitmap = false;
window_start = entry->offset;
window_free = entry->bytes;
last = entry;
max_extent = entry->bytes;
- while(1) {
+ while (1) {
/* out window is just right, lets fill it */
if (window_free >= bytes + empty_size)
break;
node = rb_next(&last->offset_index);
if (!node) {
+ if (found_bitmap)
+ goto again;
ret = -ENOSPC;
goto out;
}
next = rb_entry(node, struct btrfs_free_space, offset_index);
/*
+ * we found a bitmap, so if this search doesn't result in a
+ * cluster, we know to go and search again for the bitmaps and
+ * start looking for space there
+ */
+ if (next->bitmap) {
+ if (!found_bitmap)
+ offset = next->offset;
+ found_bitmap = true;
+ last = next;
+ continue;
+ }
+
+ /*
* we haven't filled the empty size and the window is
* very large. reset and try again
*/
@@ -655,19 +1297,6 @@ again:
window_free = entry->bytes;
last = entry;
max_extent = 0;
- total_retries++;
- if (total_retries % 64 == 0) {
- if (min_bytes >= (bytes + empty_size)) {
- ret = -ENOSPC;
- goto out;
- }
- /*
- * grow our allocation a bit, we're not having
- * much luck
- */
- min_bytes *= 2;
- goto again;
- }
} else {
last = next;
window_free += next->bytes;
@@ -685,11 +1314,19 @@ again:
* The cluster includes an rbtree, but only uses the offset index
* of each free space cache entry.
*/
- while(1) {
+ while (1) {
node = rb_next(&entry->offset_index);
- unlink_free_space(block_group, entry);
+ if (entry->bitmap && node) {
+ entry = rb_entry(node, struct btrfs_free_space,
+ offset_index);
+ continue;
+ } else if (entry->bitmap && !node) {
+ break;
+ }
+
+ rb_erase(&entry->offset_index, &block_group->free_space_offset);
ret = tree_insert_offset(&cluster->root, entry->offset,
- &entry->offset_index);
+ &entry->offset_index, 0);
BUG_ON(ret);
if (!node || entry == last)
@@ -697,8 +1334,10 @@ again:
entry = rb_entry(node, struct btrfs_free_space, offset_index);
}
- ret = 0;
+
cluster->max_size = max_extent;
+got_it:
+ ret = 0;
atomic_inc(&block_group->count);
list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
cluster->block_group = block_group;
@@ -718,6 +1357,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
spin_lock_init(&cluster->refill_lock);
cluster->root.rb_node = NULL;
cluster->max_size = 0;
+ cluster->points_to_bitmap = false;
INIT_LIST_HEAD(&cluster->block_group_list);
cluster->block_group = NULL;
}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 266fb8764054..890a8e79011b 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -19,6 +19,14 @@
#ifndef __BTRFS_FREE_SPACE_CACHE
#define __BTRFS_FREE_SPACE_CACHE
+struct btrfs_free_space {
+ struct rb_node offset_index;
+ u64 offset;
+ u64 bytes;
+ unsigned long *bitmap;
+ struct list_head list;
+};
+
int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
u64 bytenr, u64 size);
int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 6b627c611808..72ce3c173d6a 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -149,6 +149,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)(ref + 1);
ret = 0;
} else if (ret < 0) {
+ if (ret == -EOVERFLOW)
+ ret = -EMLINK;
goto out;
} else {
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -177,8 +179,6 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(struct btrfs_inode_item));
- if (ret == 0 && objectid > root->highest_inode)
- root->highest_inode = objectid;
return ret;
}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 9abbced1123d..c56eb5909172 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -43,9 +43,10 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
slot = path->slots[0] - 1;
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
- *objectid = found_key.objectid;
+ *objectid = max_t(u64, found_key.objectid,
+ BTRFS_FIRST_FREE_OBJECTID - 1);
} else {
- *objectid = BTRFS_FIRST_FREE_OBJECTID;
+ *objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
}
ret = 0;
error:
@@ -53,91 +54,27 @@ error:
return ret;
}
-/*
- * walks the btree of allocated inodes and find a hole.
- */
int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 dirid, u64 *objectid)
{
- struct btrfs_path *path;
- struct btrfs_key key;
int ret;
- int slot = 0;
- u64 last_ino = 0;
- int start_found;
- struct extent_buffer *l;
- struct btrfs_key search_key;
- u64 search_start = dirid;
-
mutex_lock(&root->objectid_mutex);
- if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
- root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
- *objectid = ++root->last_inode_alloc;
- mutex_unlock(&root->objectid_mutex);
- return 0;
- }
- path = btrfs_alloc_path();
- BUG_ON(!path);
- search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID);
- search_key.objectid = search_start;
- search_key.type = 0;
- search_key.offset = 0;
-
- start_found = 0;
- ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
- if (ret < 0)
- goto error;
- while (1) {
- l = path->nodes[0];
- slot = path->slots[0];
- if (slot >= btrfs_header_nritems(l)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto error;
- if (!start_found) {
- *objectid = search_start;
- start_found = 1;
- goto found;
- }
- *objectid = last_ino > search_start ?
- last_ino : search_start;
- goto found;
- }
- btrfs_item_key_to_cpu(l, &key, slot);
- if (key.objectid >= search_start) {
- if (start_found) {
- if (last_ino < search_start)
- last_ino = search_start;
- if (key.objectid > last_ino) {
- *objectid = last_ino;
- goto found;
- }
- } else if (key.objectid > search_start) {
- *objectid = search_start;
- goto found;
- }
- }
- if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
- break;
+ if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
+ ret = btrfs_find_highest_inode(root, &root->highest_objectid);
+ if (ret)
+ goto out;
+ }
- start_found = 1;
- last_ino = key.objectid + 1;
- path->slots[0]++;
+ if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
+ ret = -ENOSPC;
+ goto out;
}
- BUG_ON(1);
-found:
- btrfs_release_path(root, path);
- btrfs_free_path(path);
- BUG_ON(*objectid < search_start);
- mutex_unlock(&root->objectid_mutex);
- return 0;
-error:
- btrfs_release_path(root, path);
- btrfs_free_path(path);
+
+ *objectid = ++root->highest_objectid;
+ ret = 0;
+out:
mutex_unlock(&root->objectid_mutex);
return ret;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dbe1aabf96cd..112e5aa85892 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -26,7 +26,6 @@
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
-#include <linux/smp_lock.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
@@ -56,14 +55,14 @@ struct btrfs_iget_args {
struct btrfs_root *root;
};
-static struct inode_operations btrfs_dir_inode_operations;
-static struct inode_operations btrfs_symlink_inode_operations;
-static struct inode_operations btrfs_dir_ro_inode_operations;
-static struct inode_operations btrfs_special_inode_operations;
-static struct inode_operations btrfs_file_inode_operations;
-static struct address_space_operations btrfs_aops;
-static struct address_space_operations btrfs_symlink_aops;
-static struct file_operations btrfs_dir_file_operations;
+static const struct inode_operations btrfs_dir_inode_operations;
+static const struct inode_operations btrfs_symlink_inode_operations;
+static const struct inode_operations btrfs_dir_ro_inode_operations;
+static const struct inode_operations btrfs_special_inode_operations;
+static const struct inode_operations btrfs_file_inode_operations;
+static const struct address_space_operations btrfs_aops;
+static const struct address_space_operations btrfs_symlink_aops;
+static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;
static struct kmem_cache *btrfs_inode_cachep;
@@ -232,7 +231,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
}
ret = btrfs_drop_extents(trans, root, inode, start,
- aligned_end, aligned_end, start, &hint_byte);
+ aligned_end, aligned_end, start,
+ &hint_byte, 1);
BUG_ON(ret);
if (isize > actual_end)
@@ -241,7 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
inline_len, compressed_size,
compressed_pages);
BUG_ON(ret);
- btrfs_drop_extent_cache(inode, start, aligned_end, 0);
+ btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
return 0;
}
@@ -426,7 +426,7 @@ again:
extent_clear_unlock_delalloc(inode,
&BTRFS_I(inode)->io_tree,
start, end, NULL, 1, 0,
- 0, 1, 1, 1);
+ 0, 1, 1, 1, 0);
ret = 0;
goto free_pages_out;
}
@@ -612,9 +612,9 @@ static noinline int submit_compressed_extents(struct inode *inode,
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
while (1) {
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
@@ -641,7 +641,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
async_extent->start,
async_extent->start +
async_extent->ram_size - 1,
- NULL, 1, 1, 0, 1, 1, 0);
+ NULL, 1, 1, 0, 1, 1, 0, 0);
ret = btrfs_submit_compressed_write(inode,
async_extent->start,
@@ -714,7 +714,7 @@ static noinline int cow_file_range(struct inode *inode,
extent_clear_unlock_delalloc(inode,
&BTRFS_I(inode)->io_tree,
start, end, NULL, 1, 1,
- 1, 1, 1, 1);
+ 1, 1, 1, 1, 0);
*nr_written = *nr_written +
(end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
*page_started = 1;
@@ -726,6 +726,15 @@ static noinline int cow_file_range(struct inode *inode,
BUG_ON(disk_num_bytes >
btrfs_super_total_bytes(&root->fs_info->super_copy));
+
+ read_lock(&BTRFS_I(inode)->extent_tree.lock);
+ em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
+ start, num_bytes);
+ if (em) {
+ alloc_hint = em->block_start;
+ free_extent_map(em);
+ }
+ read_unlock(&BTRFS_I(inode)->extent_tree.lock);
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
while (disk_num_bytes > 0) {
@@ -738,7 +747,6 @@ static noinline int cow_file_range(struct inode *inode,
em = alloc_extent_map(GFP_NOFS);
em->start = start;
em->orig_start = em->start;
-
ram_size = ins.offset;
em->len = ins.offset;
@@ -748,9 +756,9 @@ static noinline int cow_file_range(struct inode *inode,
set_bit(EXTENT_FLAG_PINNED, &em->flags);
while (1) {
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
@@ -777,11 +785,14 @@ static noinline int cow_file_range(struct inode *inode,
/* we're not doing compressed IO, don't unlock the first
* page (which the caller expects to stay locked), don't
* clear any dirty bits and don't set any writeback bits
+ *
+ * Do set the Private2 bit so we know this page was properly
+ * setup for writepage
*/
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
start, start + ram_size - 1,
locked_page, unlock, 1,
- 1, 0, 0, 0);
+ 1, 0, 0, 0, 1);
disk_num_bytes -= cur_alloc_size;
num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset;
@@ -854,7 +865,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
int limit = 10 * 1024 * 1042;
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
- EXTENT_DELALLOC, 1, 0, GFP_NOFS);
+ EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS);
while (start < end) {
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
async_cow->inode = inode;
@@ -1081,9 +1092,9 @@ out_check:
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
while (1) {
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
@@ -1102,7 +1113,7 @@ out_check:
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
cur_offset, cur_offset + num_bytes - 1,
- locked_page, 1, 1, 1, 0, 0, 0);
+ locked_page, 1, 1, 1, 0, 0, 0, 1);
cur_offset = extent_end;
if (cur_offset > end)
break;
@@ -1148,6 +1159,83 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
return ret;
}
+static int btrfs_split_extent_hook(struct inode *inode,
+ struct extent_state *orig, u64 split)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 size;
+
+ if (!(orig->state & EXTENT_DELALLOC))
+ return 0;
+
+ size = orig->end - orig->start + 1;
+ if (size > root->fs_info->max_extent) {
+ u64 num_extents;
+ u64 new_size;
+
+ new_size = orig->end - split + 1;
+ num_extents = div64_u64(size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent);
+
+ /*
+ * if we break a large extent up then leave delalloc_extents be,
+ * since we've already accounted for the large extent.
+ */
+ if (div64_u64(new_size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent) < num_extents)
+ return 0;
+ }
+
+ BTRFS_I(inode)->delalloc_extents++;
+
+ return 0;
+}
+
+/*
+ * extent_io.c merge_extent_hook, used to track merged delayed allocation
+ * extents so we can keep track of new extents that are just merged onto old
+ * extents, such as when we are doing sequential writes, so we can properly
+ * account for the metadata space we'll need.
+ */
+static int btrfs_merge_extent_hook(struct inode *inode,
+ struct extent_state *new,
+ struct extent_state *other)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 new_size, old_size;
+ u64 num_extents;
+
+ /* not delalloc, ignore it */
+ if (!(other->state & EXTENT_DELALLOC))
+ return 0;
+
+ old_size = other->end - other->start + 1;
+ if (new->start < other->start)
+ new_size = other->end - new->start + 1;
+ else
+ new_size = new->end - other->start + 1;
+
+ /* we're not bigger than the max, unreserve the space and go */
+ if (new_size <= root->fs_info->max_extent) {
+ BTRFS_I(inode)->delalloc_extents--;
+ return 0;
+ }
+
+ /*
+ * If we grew by another max_extent, just return, we want to keep that
+ * reserved amount.
+ */
+ num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent);
+ if (div64_u64(new_size + root->fs_info->max_extent - 1,
+ root->fs_info->max_extent) > num_extents)
+ return 0;
+
+ BTRFS_I(inode)->delalloc_extents--;
+
+ return 0;
+}
+
/*
* extent_io.c set_bit_hook, used to track delayed allocation
* bytes in this file, and to maintain the list of inodes that
@@ -1156,6 +1244,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
unsigned long old, unsigned long bits)
{
+
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testeing for the DELALLOC
@@ -1163,6 +1252,8 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
*/
if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ BTRFS_I(inode)->delalloc_extents++;
btrfs_delalloc_reserve_space(root, inode, end - start + 1);
spin_lock(&root->fs_info->delalloc_lock);
BTRFS_I(inode)->delalloc_bytes += end - start + 1;
@@ -1179,22 +1270,27 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
/*
* extent_io.c clear_bit_hook, see set_bit_hook for why
*/
-static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
- unsigned long old, unsigned long bits)
+static int btrfs_clear_bit_hook(struct inode *inode,
+ struct extent_state *state, unsigned long bits)
{
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testeing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
- if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
+ BTRFS_I(inode)->delalloc_extents--;
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+
spin_lock(&root->fs_info->delalloc_lock);
- if (end - start + 1 > root->fs_info->delalloc_bytes) {
+ if (state->end - state->start + 1 >
+ root->fs_info->delalloc_bytes) {
printk(KERN_INFO "btrfs warning: delalloc account "
"%llu %llu\n",
- (unsigned long long)end - start + 1,
+ (unsigned long long)
+ state->end - state->start + 1,
(unsigned long long)
root->fs_info->delalloc_bytes);
btrfs_delalloc_free_space(root, inode, (u64)-1);
@@ -1202,9 +1298,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
BTRFS_I(inode)->delalloc_bytes = 0;
} else {
btrfs_delalloc_free_space(root, inode,
- end - start + 1);
- root->fs_info->delalloc_bytes -= end - start + 1;
- BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
+ state->end -
+ state->start + 1);
+ root->fs_info->delalloc_bytes -= state->end -
+ state->start + 1;
+ BTRFS_I(inode)->delalloc_bytes -= state->end -
+ state->start + 1;
}
if (BTRFS_I(inode)->delalloc_bytes == 0 &&
!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
@@ -1375,10 +1474,8 @@ again:
lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
/* already ordered? We're done */
- if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
- EXTENT_ORDERED, 0)) {
+ if (PagePrivate2(page))
goto out;
- }
ordered = btrfs_lookup_ordered_extent(inode, page_start);
if (ordered) {
@@ -1414,11 +1511,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
struct inode *inode = page->mapping->host;
struct btrfs_writepage_fixup *fixup;
struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret;
- ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
- EXTENT_ORDERED, 0);
- if (ret)
+ /* this page is properly in the ordered list */
+ if (TestClearPagePrivate2(page))
return 0;
if (PageChecked(page))
@@ -1456,9 +1551,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
BUG_ON(!path);
path->leave_spinning = 1;
+
+ /*
+ * we may be replacing one extent in the tree with another.
+ * The new extent is pinned in the extent map, and we don't want
+ * to drop it from the cache until it is completely in the btree.
+ *
+ * So, tell btrfs_drop_extents to leave this extent in the cache.
+ * the caller is expected to unpin it and allow it to be merged
+ * with the others.
+ */
ret = btrfs_drop_extents(trans, root, inode, file_pos,
file_pos + num_bytes, locked_end,
- file_pos, &hint);
+ file_pos, &hint, 0);
BUG_ON(ret);
ins.objectid = inode->i_ino;
@@ -1486,7 +1591,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
inode_add_bytes(inode, num_bytes);
- btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
ins.objectid = disk_bytenr;
ins.offset = disk_num_bytes;
@@ -1597,6 +1701,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ordered_extent->len,
compressed, 0, 0,
BTRFS_FILE_EXTENT_REG);
+ unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+ ordered_extent->file_offset,
+ ordered_extent->len);
BUG_ON(ret);
}
unlock_extent(io_tree, ordered_extent->file_offset,
@@ -1624,6 +1731,7 @@ nocow:
static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate)
{
+ ClearPagePrivate2(page);
return btrfs_finish_ordered_io(page->mapping->host, start, end);
}
@@ -1670,13 +1778,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
failrec->last_mirror = 0;
failrec->bio_flags = 0;
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, failrec->len);
if (em->start > start || em->start + em->len < start) {
free_extent_map(em);
em = NULL;
}
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
if (!em || IS_ERR(em)) {
kfree(failrec);
@@ -1795,7 +1903,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
return 0;
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
- test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
+ test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
GFP_NOFS);
return 0;
@@ -2353,6 +2461,69 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
return ret;
}
+int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *dir, u64 objectid,
+ const char *name, int name_len)
+{
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_dir_item *di;
+ struct btrfs_key key;
+ u64 index;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+ name, name_len, -1);
+ BUG_ON(!di || IS_ERR(di));
+
+ leaf = path->nodes[0];
+ btrfs_dir_item_key_to_cpu(leaf, di, &key);
+ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ BUG_ON(ret);
+ btrfs_release_path(root, path);
+
+ ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
+ objectid, root->root_key.objectid,
+ dir->i_ino, &index, name, name_len);
+ if (ret < 0) {
+ BUG_ON(ret != -ENOENT);
+ di = btrfs_search_dir_index_item(root, path, dir->i_ino,
+ name, name_len);
+ BUG_ON(!di || IS_ERR(di));
+
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_release_path(root, path);
+ index = key.offset;
+ }
+
+ di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
+ index, name, name_len, -1);
+ BUG_ON(!di || IS_ERR(di));
+
+ leaf = path->nodes[0];
+ btrfs_dir_item_key_to_cpu(leaf, di, &key);
+ WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
+ ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ BUG_ON(ret);
+ btrfs_release_path(root, path);
+
+ btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ ret = btrfs_update_inode(trans, root, dir);
+ BUG_ON(ret);
+ dir->i_sb->s_dirt = 1;
+
+ btrfs_free_path(path);
+ return 0;
+}
+
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
@@ -2362,29 +2533,31 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
struct btrfs_trans_handle *trans;
unsigned long nr = 0;
- /*
- * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
- * the root of a subvolume or snapshot
- */
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
- inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
+ inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
return -ENOTEMPTY;
- }
trans = btrfs_start_transaction(root, 1);
btrfs_set_trans_block_group(trans, dir);
+ if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+ err = btrfs_unlink_subvol(trans, root, dir,
+ BTRFS_I(inode)->location.objectid,
+ dentry->d_name.name,
+ dentry->d_name.len);
+ goto out;
+ }
+
err = btrfs_orphan_add(trans, inode);
if (err)
- goto fail_trans;
+ goto out;
/* now the directory is empty */
err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
dentry->d_name.name, dentry->d_name.len);
if (!err)
btrfs_i_size_write(inode, 0);
-
-fail_trans:
+out:
nr = trans->blocks_used;
ret = btrfs_end_transaction_throttle(trans, root);
btrfs_btree_balance_dirty(root, nr);
@@ -2604,8 +2777,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
if (root->ref_cows)
btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
path = btrfs_alloc_path();
- path->reada = -1;
BUG_ON(!path);
+ path->reada = -1;
/* FIXME, add redo link to tree so we don't leak on crash */
key.objectid = inode->i_ino;
@@ -2865,7 +3038,12 @@ again:
goto again;
}
- btrfs_set_extent_delalloc(inode, page_start, page_end);
+ ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+ if (ret) {
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ goto out_unlock;
+ }
+
ret = 0;
if (offset != PAGE_CACHE_SIZE) {
kaddr = kmap(page);
@@ -2896,15 +3074,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
u64 last_byte;
u64 cur_offset;
u64 hole_size;
- int err;
+ int err = 0;
if (size <= hole_start)
return 0;
- err = btrfs_check_metadata_free_space(root);
- if (err)
- return err;
-
btrfs_truncate_page(inode->i_mapping, inode->i_size);
while (1) {
@@ -2936,15 +3110,21 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
cur_offset,
cur_offset + hole_size,
block_end,
- cur_offset, &hint_byte);
+ cur_offset, &hint_byte, 1);
if (err)
break;
+
+ err = btrfs_reserve_metadata_space(root, 1);
+ if (err)
+ break;
+
err = btrfs_insert_file_extent(trans, root,
inode->i_ino, cur_offset, 0,
0, hole_size, 0, hole_size,
0, 0, 0);
btrfs_drop_extent_cache(inode, hole_start,
last_byte - 1, 0);
+ btrfs_unreserve_metadata_space(root, 1);
}
free_extent_map(em);
cur_offset = last_byte;
@@ -3004,6 +3184,11 @@ void btrfs_delete_inode(struct inode *inode)
}
btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ if (inode->i_nlink > 0) {
+ BUG_ON(btrfs_root_refs(&root->root_item) != 0);
+ goto no_delete;
+ }
+
btrfs_i_size_write(inode, 0);
trans = btrfs_join_transaction(root, 1);
@@ -3071,37 +3256,81 @@ out_err:
* is kind of like crossing a mount point.
*/
static int fixup_tree_root_location(struct btrfs_root *root,
- struct btrfs_key *location,
- struct btrfs_root **sub_root,
- struct dentry *dentry)
+ struct inode *dir,
+ struct dentry *dentry,
+ struct btrfs_key *location,
+ struct btrfs_root **sub_root)
{
- struct btrfs_root_item *ri;
+ struct btrfs_path *path;
+ struct btrfs_root *new_root;
+ struct btrfs_root_ref *ref;
+ struct extent_buffer *leaf;
+ int ret;
+ int err = 0;
- if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
- return 0;
- if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
- return 0;
+ path = btrfs_alloc_path();
+ if (!path) {
+ err = -ENOMEM;
+ goto out;
+ }
- *sub_root = btrfs_read_fs_root(root->fs_info, location,
- dentry->d_name.name,
- dentry->d_name.len);
- if (IS_ERR(*sub_root))
- return PTR_ERR(*sub_root);
+ err = -ENOENT;
+ ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
+ BTRFS_I(dir)->root->root_key.objectid,
+ location->objectid);
+ if (ret) {
+ if (ret < 0)
+ err = ret;
+ goto out;
+ }
- ri = &(*sub_root)->root_item;
- location->objectid = btrfs_root_dirid(ri);
- btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
- location->offset = 0;
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
+ if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino ||
+ btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
+ goto out;
- return 0;
+ ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
+ (unsigned long)(ref + 1),
+ dentry->d_name.len);
+ if (ret)
+ goto out;
+
+ btrfs_release_path(root->fs_info->tree_root, path);
+
+ new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
+ if (IS_ERR(new_root)) {
+ err = PTR_ERR(new_root);
+ goto out;
+ }
+
+ if (btrfs_root_refs(&new_root->root_item) == 0) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ *sub_root = new_root;
+ location->objectid = btrfs_root_dirid(&new_root->root_item);
+ location->type = BTRFS_INODE_ITEM_KEY;
+ location->offset = 0;
+ err = 0;
+out:
+ btrfs_free_path(path);
+ return err;
}
static void inode_tree_add(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_inode *entry;
- struct rb_node **p = &root->inode_tree.rb_node;
- struct rb_node *parent = NULL;
+ struct rb_node **p;
+ struct rb_node *parent;
+again:
+ p = &root->inode_tree.rb_node;
+ parent = NULL;
+
+ if (hlist_unhashed(&inode->i_hash))
+ return;
spin_lock(&root->inode_lock);
while (*p) {
@@ -3109,13 +3338,16 @@ static void inode_tree_add(struct inode *inode)
entry = rb_entry(parent, struct btrfs_inode, rb_node);
if (inode->i_ino < entry->vfs_inode.i_ino)
- p = &(*p)->rb_left;
+ p = &parent->rb_left;
else if (inode->i_ino > entry->vfs_inode.i_ino)
- p = &(*p)->rb_right;
+ p = &parent->rb_right;
else {
WARN_ON(!(entry->vfs_inode.i_state &
(I_WILL_FREE | I_FREEING | I_CLEAR)));
- break;
+ rb_erase(parent, &root->inode_tree);
+ RB_CLEAR_NODE(parent);
+ spin_unlock(&root->inode_lock);
+ goto again;
}
}
rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
@@ -3126,15 +3358,89 @@ static void inode_tree_add(struct inode *inode)
static void inode_tree_del(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ int empty = 0;
+ spin_lock(&root->inode_lock);
if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
- spin_lock(&root->inode_lock);
rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
- spin_unlock(&root->inode_lock);
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+ empty = RB_EMPTY_ROOT(&root->inode_tree);
+ }
+ spin_unlock(&root->inode_lock);
+
+ if (empty && btrfs_root_refs(&root->root_item) == 0) {
+ synchronize_srcu(&root->fs_info->subvol_srcu);
+ spin_lock(&root->inode_lock);
+ empty = RB_EMPTY_ROOT(&root->inode_tree);
+ spin_unlock(&root->inode_lock);
+ if (empty)
+ btrfs_add_dead_root(root);
}
}
+int btrfs_invalidate_inodes(struct btrfs_root *root)
+{
+ struct rb_node *node;
+ struct rb_node *prev;
+ struct btrfs_inode *entry;
+ struct inode *inode;
+ u64 objectid = 0;
+
+ WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+
+ spin_lock(&root->inode_lock);
+again:
+ node = root->inode_tree.rb_node;
+ prev = NULL;
+ while (node) {
+ prev = node;
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+ if (objectid < entry->vfs_inode.i_ino)
+ node = node->rb_left;
+ else if (objectid > entry->vfs_inode.i_ino)
+ node = node->rb_right;
+ else
+ break;
+ }
+ if (!node) {
+ while (prev) {
+ entry = rb_entry(prev, struct btrfs_inode, rb_node);
+ if (objectid <= entry->vfs_inode.i_ino) {
+ node = prev;
+ break;
+ }
+ prev = rb_next(prev);
+ }
+ }
+ while (node) {
+ entry = rb_entry(node, struct btrfs_inode, rb_node);
+ objectid = entry->vfs_inode.i_ino + 1;
+ inode = igrab(&entry->vfs_inode);
+ if (inode) {
+ spin_unlock(&root->inode_lock);
+ if (atomic_read(&inode->i_count) > 1)
+ d_prune_aliases(inode);
+ /*
+ * btrfs_drop_inode will remove it from
+ * the inode cache when its usage count
+ * hits zero.
+ */
+ iput(inode);
+ cond_resched();
+ spin_lock(&root->inode_lock);
+ goto again;
+ }
+
+ if (cond_resched_lock(&root->inode_lock))
+ goto again;
+
+ node = rb_next(node);
+ }
+ spin_unlock(&root->inode_lock);
+ return 0;
+}
+
static noinline void init_btrfs_i(struct inode *inode)
{
struct btrfs_inode *bi = BTRFS_I(inode);
@@ -3219,15 +3525,41 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
return inode;
}
+static struct inode *new_simple_dir(struct super_block *s,
+ struct btrfs_key *key,
+ struct btrfs_root *root)
+{
+ struct inode *inode = new_inode(s);
+
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ init_btrfs_i(inode);
+
+ BTRFS_I(inode)->root = root;
+ memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
+ BTRFS_I(inode)->dummy_inode = 1;
+
+ inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
+ inode->i_op = &simple_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+ inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+ return inode;
+}
+
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
struct inode *inode;
- struct btrfs_inode *bi = BTRFS_I(dir);
- struct btrfs_root *root = bi->root;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *sub_root = root;
struct btrfs_key location;
+ int index;
int ret;
+ dentry->d_op = &btrfs_dentry_operations;
+
if (dentry->d_name.len > BTRFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
@@ -3236,29 +3568,50 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (ret < 0)
return ERR_PTR(ret);
- inode = NULL;
- if (location.objectid) {
- ret = fixup_tree_root_location(root, &location, &sub_root,
- dentry);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
- return ERR_PTR(-ENOENT);
+ if (location.objectid == 0)
+ return NULL;
+
+ if (location.type == BTRFS_INODE_ITEM_KEY) {
+ inode = btrfs_iget(dir->i_sb, &location, root);
+ return inode;
+ }
+
+ BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
+
+ index = srcu_read_lock(&root->fs_info->subvol_srcu);
+ ret = fixup_tree_root_location(root, dir, dentry,
+ &location, &sub_root);
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ inode = ERR_PTR(ret);
+ else
+ inode = new_simple_dir(dir->i_sb, &location, sub_root);
+ } else {
inode = btrfs_iget(dir->i_sb, &location, sub_root);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
}
+ srcu_read_unlock(&root->fs_info->subvol_srcu, index);
+
return inode;
}
+static int btrfs_dentry_delete(struct dentry *dentry)
+{
+ struct btrfs_root *root;
+
+ if (!dentry->d_inode)
+ return 0;
+
+ root = BTRFS_I(dentry->d_inode)->root;
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return 1;
+ return 0;
+}
+
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
struct nameidata *nd)
{
struct inode *inode;
- if (dentry->d_name.len > BTRFS_NAME_LEN)
- return ERR_PTR(-ENAMETOOLONG);
-
inode = btrfs_lookup_dentry(dir, dentry);
if (IS_ERR(inode))
return ERR_CAST(inode);
@@ -3580,12 +3933,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
owner = 1;
BTRFS_I(inode)->block_group =
btrfs_find_block_group(root, 0, alloc_hint, owner);
- if ((mode & S_IFREG)) {
- if (btrfs_test_opt(root, NODATASUM))
- BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
- if (btrfs_test_opt(root, NODATACOW))
- BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
- }
key[0].objectid = objectid;
btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
@@ -3603,9 +3950,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
if (ret != 0)
goto fail;
- if (objectid > root->highest_inode)
- root->highest_inode = objectid;
-
inode->i_uid = current_fsuid();
if (dir && (dir->i_mode & S_ISGID)) {
@@ -3640,6 +3984,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
btrfs_inherit_iflags(inode, dir);
+ if ((mode & S_IFREG)) {
+ if (btrfs_test_opt(root, NODATASUM))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+ if (btrfs_test_opt(root, NODATACOW))
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+ }
+
insert_inode_hash(inode);
inode_tree_add(inode);
return inode;
@@ -3666,26 +4017,35 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
struct inode *parent_inode, struct inode *inode,
const char *name, int name_len, int add_backref, u64 index)
{
- int ret;
+ int ret = 0;
struct btrfs_key key;
struct btrfs_root *root = BTRFS_I(parent_inode)->root;
- key.objectid = inode->i_ino;
- btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
- key.offset = 0;
+ if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
+ } else {
+ key.objectid = inode->i_ino;
+ btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+ key.offset = 0;
+ }
+
+ if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
+ key.objectid, root->root_key.objectid,
+ parent_inode->i_ino,
+ index, name, name_len);
+ } else if (add_backref) {
+ ret = btrfs_insert_inode_ref(trans, root,
+ name, name_len, inode->i_ino,
+ parent_inode->i_ino, index);
+ }
- ret = btrfs_insert_dir_item(trans, root, name, name_len,
- parent_inode->i_ino,
- &key, btrfs_inode_type(inode),
- index);
if (ret == 0) {
- if (add_backref) {
- ret = btrfs_insert_inode_ref(trans, root,
- name, name_len,
- inode->i_ino,
- parent_inode->i_ino,
- index);
- }
+ ret = btrfs_insert_dir_item(trans, root, name, name_len,
+ parent_inode->i_ino, &key,
+ btrfs_inode_type(inode), index);
+ BUG_ON(ret);
+
btrfs_i_size_write(parent_inode, parent_inode->i_size +
name_len * 2);
parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
@@ -3725,11 +4085,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
if (!new_valid_dev(rdev))
return -EINVAL;
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 2 for inode item and ref
+ * 2 for dir items
+ * 1 for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
if (err)
- goto fail;
+ return err;
trans = btrfs_start_transaction(root, 1);
+ if (!trans)
+ goto fail;
btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -3767,6 +4134,7 @@ out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
fail:
+ btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -3787,10 +4155,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
u64 objectid;
u64 index = 0;
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 2 for inode item and ref
+ * 2 for dir items
+ * 1 for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
if (err)
- goto fail;
+ return err;
+
trans = btrfs_start_transaction(root, 1);
+ if (!trans)
+ goto fail;
btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -3831,6 +4207,7 @@ out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
fail:
+ btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -3853,10 +4230,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (inode->i_nlink == 0)
return -ENOENT;
- btrfs_inc_nlink(inode);
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 1 item for inode ref
+ * 2 items for dir items
+ */
+ err = btrfs_reserve_metadata_space(root, 3);
if (err)
- goto fail;
+ return err;
+
+ btrfs_inc_nlink(inode);
+
err = btrfs_set_inode_index(dir, &index);
if (err)
goto fail;
@@ -3868,20 +4251,19 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
err = btrfs_add_nondir(trans, dentry, inode, 1, index);
- if (err)
- drop_inode = 1;
-
- btrfs_update_inode_block_group(trans, dir);
- err = btrfs_update_inode(trans, root, inode);
-
- if (err)
+ if (err) {
drop_inode = 1;
+ } else {
+ btrfs_update_inode_block_group(trans, dir);
+ err = btrfs_update_inode(trans, root, inode);
+ BUG_ON(err);
+ btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
+ }
nr = trans->blocks_used;
-
- btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
btrfs_end_transaction_throttle(trans, root);
fail:
+ btrfs_unreserve_metadata_space(root, 3);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -3901,17 +4283,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
u64 index = 0;
unsigned long nr = 1;
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 2 items for inode and ref
+ * 2 items for dir items
+ * 1 for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
if (err)
- goto out_unlock;
+ return err;
trans = btrfs_start_transaction(root, 1);
- btrfs_set_trans_block_group(trans, dir);
-
- if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ if (!trans) {
+ err = -ENOMEM;
goto out_unlock;
}
+ btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
if (err) {
@@ -3960,6 +4346,7 @@ out_fail:
btrfs_end_transaction_throttle(trans, root);
out_unlock:
+ btrfs_unreserve_metadata_space(root, 5);
if (drop_on_err)
iput(inode);
btrfs_btree_balance_dirty(root, nr);
@@ -4057,11 +4444,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
int compressed;
again:
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (em)
em->bdev = root->fs_info->fs_devices->latest_bdev;
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
if (em) {
if (em->start > start || em->start + em->len <= start)
@@ -4208,6 +4595,11 @@ again:
map = kmap(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
copy_size);
+ if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
+ memset(map + pg_offset + copy_size, 0,
+ PAGE_CACHE_SIZE - pg_offset -
+ copy_size);
+ }
kunmap(page);
}
flush_dcache_page(page);
@@ -4252,7 +4644,7 @@ insert:
}
err = 0;
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
/* it is possible that someone inserted the extent into the tree
* while we had the lock dropped. It is also possible that
@@ -4292,7 +4684,7 @@ insert:
err = 0;
}
}
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
out:
if (path)
btrfs_free_path(path);
@@ -4391,13 +4783,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+ /*
+ * we have the page locked, so new writeback can't start,
+ * and the dirty bit won't be cleared while we are here.
+ *
+ * Wait for IO on this page so that we can safely clear
+ * the PagePrivate2 bit and do ordered accounting
+ */
wait_on_page_writeback(page);
+
tree = &BTRFS_I(page->mapping->host)->io_tree;
if (offset) {
btrfs_releasepage(page, GFP_NOFS);
return;
}
-
lock_extent(tree, page_start, page_end, GFP_NOFS);
ordered = btrfs_lookup_ordered_extent(page->mapping->host,
page_offset(page));
@@ -4408,16 +4808,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
*/
clear_extent_bit(tree, page_start, page_end,
EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_LOCKED, 1, 0, GFP_NOFS);
- btrfs_finish_ordered_io(page->mapping->host,
- page_start, page_end);
+ EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
+ /*
+ * whoever cleared the private bit is responsible
+ * for the finish_ordered_io
+ */
+ if (TestClearPagePrivate2(page)) {
+ btrfs_finish_ordered_io(page->mapping->host,
+ page_start, page_end);
+ }
btrfs_put_ordered_extent(ordered);
lock_extent(tree, page_start, page_end, GFP_NOFS);
}
clear_extent_bit(tree, page_start, page_end,
- EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_ORDERED,
- 1, 1, GFP_NOFS);
+ EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
+ 1, 1, NULL, GFP_NOFS);
__btrfs_releasepage(page, GFP_NOFS);
ClearPageChecked(page);
@@ -4466,6 +4871,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
goto out;
}
+ ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
+ if (ret) {
+ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
lock_page(page);
@@ -4497,7 +4909,23 @@ again:
goto again;
}
- btrfs_set_extent_delalloc(inode, page_start, page_end);
+ /*
+ * XXX - page_mkwrite gets called every time the page is dirtied, even
+ * if it was already dirty, so for space accounting reasons we need to
+ * clear any delalloc bits for the range we are fixing to save. There
+ * is probably a better way to do this, but for now keep consistent with
+ * prepare_pages in the normal write path.
+ */
+ clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ EXTENT_DIRTY | EXTENT_DELALLOC, GFP_NOFS);
+
+ ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+ if (ret) {
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ ret = VM_FAULT_SIGBUS;
+ btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+ goto out_unlock;
+ }
ret = 0;
/* page is wholly or partially inside EOF */
@@ -4514,11 +4942,15 @@ again:
}
ClearPageChecked(page);
set_page_dirty(page);
+ SetPageUptodate(page);
BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
out_unlock:
+ btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+ if (!ret)
+ return VM_FAULT_LOCKED;
unlock_page(page);
out:
return ret;
@@ -4587,11 +5019,11 @@ out:
* create a new subvolume directory/inode (helper for the ioctl).
*/
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *new_root, struct dentry *dentry,
+ struct btrfs_root *new_root,
u64 new_dirid, u64 alloc_hint)
{
struct inode *inode;
- int error;
+ int err;
u64 index = 0;
inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
@@ -4604,11 +5036,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
inode->i_nlink = 1;
btrfs_i_size_write(inode, 0);
- error = btrfs_update_inode(trans, new_root, inode);
- if (error)
- return error;
+ err = btrfs_update_inode(trans, new_root, inode);
+ BUG_ON(err);
- d_instantiate(dentry, inode);
+ iput(inode);
return 0;
}
@@ -4634,6 +5065,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
return NULL;
ei->last_trans = 0;
ei->logged_trans = 0;
+ ei->delalloc_extents = 0;
+ ei->delalloc_reserved_extents = 0;
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->i_orphan);
INIT_LIST_HEAD(&ei->ordered_operations);
@@ -4686,6 +5119,16 @@ void btrfs_destroy_inode(struct inode *inode)
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
+void btrfs_drop_inode(struct inode *inode)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
+ generic_delete_inode(inode);
+ else
+ generic_drop_inode(inode);
+}
+
static void init_once(void *foo)
{
struct btrfs_inode *ei = (struct btrfs_inode *) foo;
@@ -4754,44 +5197,81 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
+ struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
struct timespec ctime = CURRENT_TIME;
u64 index = 0;
+ u64 root_objectid;
int ret;
- /* we're not allowed to rename between subvolumes */
- if (BTRFS_I(old_inode)->root->root_key.objectid !=
- BTRFS_I(new_dir)->root->root_key.objectid)
+ if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+ return -EPERM;
+
+ /* we only allow rename subvolume link between subvolumes */
+ if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
return -EXDEV;
+ if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
+ (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID))
+ return -ENOTEMPTY;
+
if (S_ISDIR(old_inode->i_mode) && new_inode &&
- new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
+ new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
- }
- /* to rename a snapshot or subvolume, we need to juggle the
- * backrefs. This isn't coded yet
+ /*
+ * 2 items for dir items
+ * 1 item for orphan entry
+ * 1 item for ref
*/
- if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
- return -EXDEV;
-
- ret = btrfs_check_metadata_free_space(root);
+ ret = btrfs_reserve_metadata_space(root, 4);
if (ret)
- goto out_unlock;
+ return ret;
/*
* we're using rename to replace one file with another.
* and the replacement file is large. Start IO on it now so
* we don't add too much work to the end of the transaction
*/
- if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
- new_inode->i_size &&
+ if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
filemap_flush(old_inode->i_mapping);
+ /* close the racy window with snapshot create/destroy ioctl */
+ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ down_read(&root->fs_info->subvol_sem);
+
trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, new_dir);
+
+ if (dest != root)
+ btrfs_record_root_in_trans(trans, dest);
+ ret = btrfs_set_inode_index(new_dir, &index);
+ if (ret)
+ goto out_fail;
+
+ if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ /* force full log commit if subvolume involved. */
+ root->fs_info->last_trans_log_full_commit = trans->transid;
+ } else {
+ ret = btrfs_insert_inode_ref(trans, dest,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len,
+ old_inode->i_ino,
+ new_dir->i_ino, index);
+ if (ret)
+ goto out_fail;
+ /*
+ * this is an ugly little race, but the rename is required
+ * to make sure that if we crash, the inode is either at the
+ * old name or the new one. pinning the log transaction lets
+ * us make sure we don't allow a log commit to come in after
+ * we unlink the name but before we add the new name back in.
+ */
+ btrfs_pin_log_trans(root);
+ }
/*
* make sure the inode gets flushed if it is replacing
* something.
@@ -4801,18 +5281,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
btrfs_add_ordered_operation(trans, root, old_inode);
}
- /*
- * this is an ugly little race, but the rename is required to make
- * sure that if we crash, the inode is either at the old name
- * or the new one. pinning the log transaction lets us make sure
- * we don't allow a log commit to come in after we unlink the
- * name but before we add the new name back in.
- */
- btrfs_pin_log_trans(root);
-
- btrfs_set_trans_block_group(trans, new_dir);
-
- btrfs_inc_nlink(old_dentry->d_inode);
old_dir->i_ctime = old_dir->i_mtime = ctime;
new_dir->i_ctime = new_dir->i_mtime = ctime;
old_inode->i_ctime = ctime;
@@ -4820,47 +5288,60 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
- ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
- old_dentry->d_name.name,
- old_dentry->d_name.len);
- if (ret)
- goto out_fail;
+ if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+ root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+ ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len);
+ } else {
+ btrfs_inc_nlink(old_dentry->d_inode);
+ ret = btrfs_unlink_inode(trans, root, old_dir,
+ old_dentry->d_inode,
+ old_dentry->d_name.name,
+ old_dentry->d_name.len);
+ }
+ BUG_ON(ret);
if (new_inode) {
new_inode->i_ctime = CURRENT_TIME;
- ret = btrfs_unlink_inode(trans, root, new_dir,
- new_dentry->d_inode,
- new_dentry->d_name.name,
- new_dentry->d_name.len);
- if (ret)
- goto out_fail;
+ if (unlikely(new_inode->i_ino ==
+ BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+ root_objectid = BTRFS_I(new_inode)->location.objectid;
+ ret = btrfs_unlink_subvol(trans, dest, new_dir,
+ root_objectid,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len);
+ BUG_ON(new_inode->i_nlink == 0);
+ } else {
+ ret = btrfs_unlink_inode(trans, dest, new_dir,
+ new_dentry->d_inode,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len);
+ }
+ BUG_ON(ret);
if (new_inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans, new_dentry->d_inode);
- if (ret)
- goto out_fail;
+ BUG_ON(ret);
}
-
}
- ret = btrfs_set_inode_index(new_dir, &index);
- if (ret)
- goto out_fail;
- ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
- old_inode, new_dentry->d_name.name,
- new_dentry->d_name.len, 1, index);
- if (ret)
- goto out_fail;
+ ret = btrfs_add_link(trans, new_dir, old_inode,
+ new_dentry->d_name.name,
+ new_dentry->d_name.len, 0, index);
+ BUG_ON(ret);
- btrfs_log_new_name(trans, old_inode, old_dir,
- new_dentry->d_parent);
+ if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_log_new_name(trans, old_inode, old_dir,
+ new_dentry->d_parent);
+ btrfs_end_log_trans(root);
+ }
out_fail:
-
- /* this btrfs_end_log_trans just allows the current
- * log-sub transaction to complete
- */
- btrfs_end_log_trans(root);
btrfs_end_transaction_throttle(trans, root);
-out_unlock:
+
+ if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+ up_read(&root->fs_info->subvol_sem);
+
+ btrfs_unreserve_metadata_space(root, 4);
return ret;
}
@@ -4932,11 +5413,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
return -ENAMETOOLONG;
- err = btrfs_check_metadata_free_space(root);
+ /*
+ * 2 items for inode item and ref
+ * 2 items for dir items
+ * 1 item for xattr if selinux is on
+ */
+ err = btrfs_reserve_metadata_space(root, 5);
if (err)
- goto out_fail;
+ return err;
trans = btrfs_start_transaction(root, 1);
+ if (!trans)
+ goto out_fail;
btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
@@ -5017,6 +5505,7 @@ out_unlock:
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
out_fail:
+ btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
@@ -5038,6 +5527,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
while (num_bytes > 0) {
alloc_size = min(num_bytes, root->fs_info->max_extent);
+
+ ret = btrfs_reserve_metadata_space(root, 1);
+ if (ret)
+ goto out;
+
ret = btrfs_reserve_extent(trans, root, alloc_size,
root->sectorsize, 0, alloc_hint,
(u64)-1, &ins, 1);
@@ -5052,9 +5546,12 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
0, 0, 0,
BTRFS_FILE_EXTENT_PREALLOC);
BUG_ON(ret);
+ btrfs_drop_extent_cache(inode, cur_offset,
+ cur_offset + ins.offset -1, 0);
num_bytes -= ins.offset;
cur_offset += ins.offset;
alloc_hint = ins.objectid + ins.offset;
+ btrfs_unreserve_metadata_space(root, 1);
}
out:
if (cur_offset > start) {
@@ -5082,6 +5579,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
struct extent_map *em;
struct btrfs_trans_handle *trans;
+ struct btrfs_root *root;
int ret;
alloc_start = offset & ~mask;
@@ -5100,6 +5598,13 @@ static long btrfs_fallocate(struct inode *inode, int mode,
goto out;
}
+ root = BTRFS_I(inode)->root;
+
+ ret = btrfs_check_data_free_space(root, inode,
+ alloc_end - alloc_start);
+ if (ret)
+ goto out;
+
locked_end = alloc_end - 1;
while (1) {
struct btrfs_ordered_extent *ordered;
@@ -5107,7 +5612,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
if (!trans) {
ret = -EIO;
- goto out;
+ goto out_free;
}
/* the extent lock is ordered inside the running
@@ -5168,6 +5673,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
GFP_NOFS);
btrfs_end_transaction(trans, BTRFS_I(inode)->root);
+out_free:
+ btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start);
out:
mutex_unlock(&inode->i_mutex);
return ret;
@@ -5185,7 +5692,7 @@ static int btrfs_permission(struct inode *inode, int mask)
return generic_permission(inode, mask, btrfs_check_acl);
}
-static struct inode_operations btrfs_dir_inode_operations = {
+static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
.create = btrfs_create,
@@ -5203,11 +5710,12 @@ static struct inode_operations btrfs_dir_inode_operations = {
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
};
-static struct inode_operations btrfs_dir_ro_inode_operations = {
+static const struct inode_operations btrfs_dir_ro_inode_operations = {
.lookup = btrfs_lookup,
.permission = btrfs_permission,
};
-static struct file_operations btrfs_dir_file_operations = {
+
+static const struct file_operations btrfs_dir_file_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = btrfs_real_readdir,
@@ -5229,6 +5737,8 @@ static struct extent_io_ops btrfs_extent_io_ops = {
.readpage_io_failed_hook = btrfs_io_failed_hook,
.set_bit_hook = btrfs_set_bit_hook,
.clear_bit_hook = btrfs_clear_bit_hook,
+ .merge_extent_hook = btrfs_merge_extent_hook,
+ .split_extent_hook = btrfs_split_extent_hook,
};
/*
@@ -5243,7 +5753,7 @@ static struct extent_io_ops btrfs_extent_io_ops = {
*
* For now we're avoiding this by dropping bmap.
*/
-static struct address_space_operations btrfs_aops = {
+static const struct address_space_operations btrfs_aops = {
.readpage = btrfs_readpage,
.writepage = btrfs_writepage,
.writepages = btrfs_writepages,
@@ -5253,16 +5763,17 @@ static struct address_space_operations btrfs_aops = {
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
.set_page_dirty = btrfs_set_page_dirty,
+ .error_remove_page = generic_error_remove_page,
};
-static struct address_space_operations btrfs_symlink_aops = {
+static const struct address_space_operations btrfs_symlink_aops = {
.readpage = btrfs_readpage,
.writepage = btrfs_writepage,
.invalidatepage = btrfs_invalidatepage,
.releasepage = btrfs_releasepage,
};
-static struct inode_operations btrfs_file_inode_operations = {
+static const struct inode_operations btrfs_file_inode_operations = {
.truncate = btrfs_truncate,
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
@@ -5274,7 +5785,7 @@ static struct inode_operations btrfs_file_inode_operations = {
.fallocate = btrfs_fallocate,
.fiemap = btrfs_fiemap,
};
-static struct inode_operations btrfs_special_inode_operations = {
+static const struct inode_operations btrfs_special_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
.permission = btrfs_permission,
@@ -5283,7 +5794,7 @@ static struct inode_operations btrfs_special_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
};
-static struct inode_operations btrfs_symlink_inode_operations = {
+static const struct inode_operations btrfs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
@@ -5293,3 +5804,7 @@ static struct inode_operations btrfs_symlink_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
};
+
+struct dentry_operations btrfs_dentry_operations = {
+ .d_delete = btrfs_dentry_delete,
+};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index eff18f5b5362..9a780c8d0ac8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -27,7 +27,6 @@
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
-#include <linux/smp_lock.h>
#include <linux/backing-dev.h>
#include <linux/mount.h>
#include <linux/mpage.h>
@@ -231,8 +230,8 @@ static noinline int create_subvol(struct btrfs_root *root,
struct btrfs_root_item root_item;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
- struct btrfs_root *new_root = root;
- struct inode *dir;
+ struct btrfs_root *new_root;
+ struct inode *dir = dentry->d_parent->d_inode;
int ret;
int err;
u64 objectid;
@@ -240,9 +239,15 @@ static noinline int create_subvol(struct btrfs_root *root,
u64 index = 0;
unsigned long nr = 1;
- ret = btrfs_check_metadata_free_space(root);
+ /*
+ * 1 - inode item
+ * 2 - refs
+ * 1 - root item
+ * 2 - dir items
+ */
+ ret = btrfs_reserve_metadata_space(root, 6);
if (ret)
- goto fail_commit;
+ return ret;
trans = btrfs_start_transaction(root, 1);
BUG_ON(!trans);
@@ -305,11 +310,17 @@ static noinline int create_subvol(struct btrfs_root *root,
if (ret)
goto fail;
+ key.offset = (u64)-1;
+ new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
+ BUG_ON(IS_ERR(new_root));
+
+ btrfs_record_root_in_trans(trans, new_root);
+
+ ret = btrfs_create_subvol_root(trans, new_root, new_dirid,
+ BTRFS_I(dir)->block_group);
/*
* insert the directory item
*/
- key.offset = (u64)-1;
- dir = dentry->d_parent->d_inode;
ret = btrfs_set_inode_index(dir, &index);
BUG_ON(ret);
@@ -323,43 +334,20 @@ static noinline int create_subvol(struct btrfs_root *root,
ret = btrfs_update_inode(trans, root, dir);
BUG_ON(ret);
- /* add the backref first */
ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
- objectid, BTRFS_ROOT_BACKREF_KEY,
- root->root_key.objectid,
+ objectid, root->root_key.objectid,
dir->i_ino, index, name, namelen);
BUG_ON(ret);
- /* now add the forward ref */
- ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
- root->root_key.objectid, BTRFS_ROOT_REF_KEY,
- objectid,
- dir->i_ino, index, name, namelen);
-
- BUG_ON(ret);
-
- ret = btrfs_commit_transaction(trans, root);
- if (ret)
- goto fail_commit;
-
- new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
- BUG_ON(!new_root);
-
- trans = btrfs_start_transaction(new_root, 1);
- BUG_ON(!trans);
-
- ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
- BTRFS_I(dir)->block_group);
- if (ret)
- goto fail;
-
+ d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
fail:
nr = trans->blocks_used;
- err = btrfs_commit_transaction(trans, new_root);
+ err = btrfs_commit_transaction(trans, root);
if (err && !ret)
ret = err;
-fail_commit:
+
+ btrfs_unreserve_metadata_space(root, 6);
btrfs_btree_balance_dirty(root, nr);
return ret;
}
@@ -376,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
if (!root->ref_cows)
return -EINVAL;
- ret = btrfs_check_metadata_free_space(root);
+ /*
+ * 1 - inode item
+ * 2 - refs
+ * 1 - root item
+ * 2 - dir items
+ */
+ ret = btrfs_reserve_metadata_space(root, 6);
if (ret)
goto fail_unlock;
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
if (!pending_snapshot) {
ret = -ENOMEM;
+ btrfs_unreserve_metadata_space(root, 6);
goto fail_unlock;
}
pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
if (!pending_snapshot->name) {
ret = -ENOMEM;
kfree(pending_snapshot);
+ btrfs_unreserve_metadata_space(root, 6);
goto fail_unlock;
}
memcpy(pending_snapshot->name, name, namelen);
@@ -421,14 +417,15 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
* sys_mkdirat and vfs_mkdir, but we only do a single component lookup
* inside this filesystem so it's quite a bit simpler.
*/
-static noinline int btrfs_mksubvol(struct path *parent, char *name,
- int mode, int namelen,
+static noinline int btrfs_mksubvol(struct path *parent,
+ char *name, int namelen,
struct btrfs_root *snap_src)
{
+ struct inode *dir = parent->dentry->d_inode;
struct dentry *dentry;
int error;
- mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
dentry = lookup_one_len(name, parent->dentry, namelen);
error = PTR_ERR(dentry);
@@ -439,99 +436,39 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
if (dentry->d_inode)
goto out_dput;
- if (!IS_POSIXACL(parent->dentry->d_inode))
- mode &= ~current_umask();
-
error = mnt_want_write(parent->mnt);
if (error)
goto out_dput;
- error = btrfs_may_create(parent->dentry->d_inode, dentry);
+ error = btrfs_may_create(dir, dentry);
if (error)
goto out_drop_write;
- /*
- * Actually perform the low-level subvolume creation after all
- * this VFS fuzz.
- *
- * Eventually we want to pass in an inode under which we create this
- * subvolume, but for now all are under the filesystem root.
- *
- * Also we should pass on the mode eventually to allow creating new
- * subvolume with specific mode bits.
- */
+ down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
+
+ if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
+ goto out_up_read;
+
if (snap_src) {
- struct dentry *dir = dentry->d_parent;
- struct dentry *test = dir->d_parent;
- struct btrfs_path *path = btrfs_alloc_path();
- int ret;
- u64 test_oid;
- u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
-
- test_oid = snap_src->root_key.objectid;
-
- ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
- path, parent_oid, test_oid);
- if (ret == 0)
- goto create;
- btrfs_release_path(snap_src->fs_info->tree_root, path);
-
- /* we need to make sure we aren't creating a directory loop
- * by taking a snapshot of something that has our current
- * subvol in its directory tree. So, this loops through
- * the dentries and checks the forward refs for each subvolume
- * to see if is references the subvolume where we are
- * placing this new snapshot.
- */
- while (1) {
- if (!test ||
- dir == snap_src->fs_info->sb->s_root ||
- test == snap_src->fs_info->sb->s_root ||
- test->d_inode->i_sb != snap_src->fs_info->sb) {
- break;
- }
- if (S_ISLNK(test->d_inode->i_mode)) {
- printk(KERN_INFO "Btrfs symlink in snapshot "
- "path, failed\n");
- error = -EMLINK;
- btrfs_free_path(path);
- goto out_drop_write;
- }
- test_oid =
- BTRFS_I(test->d_inode)->root->root_key.objectid;
- ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
- path, test_oid, parent_oid);
- if (ret == 0) {
- printk(KERN_INFO "Btrfs snapshot creation "
- "failed, looping\n");
- error = -EMLINK;
- btrfs_free_path(path);
- goto out_drop_write;
- }
- btrfs_release_path(snap_src->fs_info->tree_root, path);
- test = test->d_parent;
- }
-create:
- btrfs_free_path(path);
- error = create_snapshot(snap_src, dentry, name, namelen);
+ error = create_snapshot(snap_src, dentry,
+ name, namelen);
} else {
- error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
- dentry, name, namelen);
+ error = create_subvol(BTRFS_I(dir)->root, dentry,
+ name, namelen);
}
- if (error)
- goto out_drop_write;
-
- fsnotify_mkdir(parent->dentry->d_inode, dentry);
+ if (!error)
+ fsnotify_mkdir(dir, dentry);
+out_up_read:
+ up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
out_drop_write:
mnt_drop_write(parent->mnt);
out_dput:
dput(dentry);
out_unlock:
- mutex_unlock(&parent->dentry->d_inode->i_mutex);
+ mutex_unlock(&dir->i_mutex);
return error;
}
-
static int btrfs_defrag_file(struct file *file)
{
struct inode *inode = fdentry(file)->d_inode;
@@ -597,9 +534,8 @@ again:
clear_page_dirty_for_io(page);
btrfs_set_extent_delalloc(inode, page_start, page_end);
-
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
set_page_dirty(page);
+ unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
unlock_page(page);
page_cache_release(page);
balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
@@ -610,7 +546,8 @@ out_unlock:
return 0;
}
-static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
+static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
+ void __user *arg)
{
u64 new_size;
u64 old_size;
@@ -719,10 +656,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
{
struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
struct btrfs_ioctl_vol_args *vol_args;
- struct btrfs_dir_item *di;
- struct btrfs_path *path;
struct file *src_file;
- u64 root_dirid;
int namelen;
int ret = 0;
@@ -740,32 +674,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
goto out;
}
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
-
- root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
- di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
- path, root_dirid,
- vol_args->name, namelen, 0);
- btrfs_free_path(path);
-
- if (di && !IS_ERR(di)) {
- ret = -EEXIST;
- goto out;
- }
-
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto out;
- }
-
if (subvol) {
- ret = btrfs_mksubvol(&file->f_path, vol_args->name,
- file->f_path.dentry->d_inode->i_mode,
- namelen, NULL);
+ ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
+ NULL);
} else {
struct inode *src_inode;
src_file = fget(vol_args->fd);
@@ -782,17 +693,156 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
fput(src_file);
goto out;
}
- ret = btrfs_mksubvol(&file->f_path, vol_args->name,
- file->f_path.dentry->d_inode->i_mode,
- namelen, BTRFS_I(src_inode)->root);
+ ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
+ BTRFS_I(src_inode)->root);
fput(src_file);
}
-
out:
kfree(vol_args);
return ret;
}
+/*
+ * helper to check if the subvolume references other subvolumes
+ */
+static noinline int may_destroy_subvol(struct btrfs_root *root)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = root->root_key.objectid;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
+ &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ BUG_ON(ret == 0);
+
+ ret = 0;
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid == root->root_key.objectid &&
+ key.type == BTRFS_ROOT_REF_KEY)
+ ret = -ENOTEMPTY;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_snap_destroy(struct file *file,
+ void __user *arg)
+{
+ struct dentry *parent = fdentry(file);
+ struct dentry *dentry;
+ struct inode *dir = parent->d_inode;
+ struct inode *inode;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *dest = NULL;
+ struct btrfs_ioctl_vol_args *vol_args;
+ struct btrfs_trans_handle *trans;
+ int namelen;
+ int ret;
+ int err = 0;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ vol_args = memdup_user(arg, sizeof(*vol_args));
+ if (IS_ERR(vol_args))
+ return PTR_ERR(vol_args);
+
+ vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+ namelen = strlen(vol_args->name);
+ if (strchr(vol_args->name, '/') ||
+ strncmp(vol_args->name, "..", namelen) == 0) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = mnt_want_write(file->f_path.mnt);
+ if (err)
+ goto out;
+
+ mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ dentry = lookup_one_len(vol_args->name, parent, namelen);
+ if (IS_ERR(dentry)) {
+ err = PTR_ERR(dentry);
+ goto out_unlock_dir;
+ }
+
+ if (!dentry->d_inode) {
+ err = -ENOENT;
+ goto out_dput;
+ }
+
+ inode = dentry->d_inode;
+ if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ err = -EINVAL;
+ goto out_dput;
+ }
+
+ dest = BTRFS_I(inode)->root;
+
+ mutex_lock(&inode->i_mutex);
+ err = d_invalidate(dentry);
+ if (err)
+ goto out_unlock;
+
+ down_write(&root->fs_info->subvol_sem);
+
+ err = may_destroy_subvol(dest);
+ if (err)
+ goto out_up_write;
+
+ trans = btrfs_start_transaction(root, 1);
+ ret = btrfs_unlink_subvol(trans, root, dir,
+ dest->root_key.objectid,
+ dentry->d_name.name,
+ dentry->d_name.len);
+ BUG_ON(ret);
+
+ btrfs_record_root_in_trans(trans, dest);
+
+ memset(&dest->root_item.drop_progress, 0,
+ sizeof(dest->root_item.drop_progress));
+ dest->root_item.drop_level = 0;
+ btrfs_set_root_refs(&dest->root_item, 0);
+
+ ret = btrfs_insert_orphan_item(trans,
+ root->fs_info->tree_root,
+ dest->root_key.objectid);
+ BUG_ON(ret);
+
+ ret = btrfs_commit_transaction(trans, root);
+ BUG_ON(ret);
+ inode->i_flags |= S_DEAD;
+out_up_write:
+ up_write(&root->fs_info->subvol_sem);
+out_unlock:
+ mutex_unlock(&inode->i_mutex);
+ if (!err) {
+ btrfs_invalidate_inodes(dest);
+ d_delete(dentry);
+ }
+out_dput:
+ dput(dentry);
+out_unlock_dir:
+ mutex_unlock(&dir->i_mutex);
+ mnt_drop_write(file->f_path.mnt);
+out:
+ kfree(vol_args);
+ return err;
+}
+
static int btrfs_ioctl_defrag(struct file *file)
{
struct inode *inode = fdentry(file)->d_inode;
@@ -866,8 +916,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
return ret;
}
-static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
- u64 off, u64 olen, u64 destoff)
+static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
+ u64 off, u64 olen, u64 destoff)
{
struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -977,7 +1027,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
/* punch hole in destination first */
btrfs_drop_extents(trans, root, inode, off, off + len,
- off + len, 0, &hint_byte);
+ off + len, 0, &hint_byte, 1);
/* clone data */
key.objectid = src->i_ino;
@@ -1028,7 +1078,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
struct btrfs_file_extent_item);
comp = btrfs_file_extent_compression(leaf, extent);
type = btrfs_file_extent_type(leaf, extent);
- if (type == BTRFS_FILE_EXTENT_REG) {
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
disko = btrfs_file_extent_disk_bytenr(leaf,
extent);
diskl = btrfs_file_extent_disk_num_bytes(leaf,
@@ -1051,7 +1102,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
new_key.objectid = inode->i_ino;
new_key.offset = key.offset + destoff - off;
- if (type == BTRFS_FILE_EXTENT_REG) {
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
ret = btrfs_insert_empty_item(trans, root, path,
&new_key, size);
if (ret)
@@ -1070,8 +1122,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
datao += off - key.offset;
datal -= off - key.offset;
}
- if (key.offset + datao + datal + key.offset >
- off + len)
+ if (key.offset + datao + datal > off + len)
datal = off + len - key.offset - datao;
/* disko == 0 means it's a hole */
if (!disko)
@@ -1181,15 +1232,15 @@ static long btrfs_ioctl_trans_start(struct file *file)
struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
- int ret = 0;
+ int ret;
+ ret = -EPERM;
if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ goto out;
- if (file->private_data) {
- ret = -EINPROGRESS;
+ ret = -EINPROGRESS;
+ if (file->private_data)
goto out;
- }
ret = mnt_want_write(file->f_path.mnt);
if (ret)
@@ -1199,12 +1250,19 @@ static long btrfs_ioctl_trans_start(struct file *file)
root->fs_info->open_ioctl_trans++;
mutex_unlock(&root->fs_info->trans_mutex);
+ ret = -ENOMEM;
trans = btrfs_start_ioctl_transaction(root, 0);
- if (trans)
- file->private_data = trans;
- else
- ret = -ENOMEM;
- /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/
+ if (!trans)
+ goto out_drop;
+
+ file->private_data = trans;
+ return 0;
+
+out_drop:
+ mutex_lock(&root->fs_info->trans_mutex);
+ root->fs_info->open_ioctl_trans--;
+ mutex_unlock(&root->fs_info->trans_mutex);
+ mnt_drop_write(file->f_path.mnt);
out:
return ret;
}
@@ -1220,24 +1278,20 @@ long btrfs_ioctl_trans_end(struct file *file)
struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
- int ret = 0;
trans = file->private_data;
- if (!trans) {
- ret = -EINVAL;
- goto out;
- }
- btrfs_end_transaction(trans, root);
+ if (!trans)
+ return -EINVAL;
file->private_data = NULL;
+ btrfs_end_transaction(trans, root);
+
mutex_lock(&root->fs_info->trans_mutex);
root->fs_info->open_ioctl_trans--;
mutex_unlock(&root->fs_info->trans_mutex);
mnt_drop_write(file->f_path.mnt);
-
-out:
- return ret;
+ return 0;
}
long btrfs_ioctl(struct file *file, unsigned int
@@ -1257,6 +1311,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_snap_create(file, argp, 0);
case BTRFS_IOC_SUBVOL_CREATE:
return btrfs_ioctl_snap_create(file, argp, 1);
+ case BTRFS_IOC_SNAP_DESTROY:
+ return btrfs_ioctl_snap_destroy(file, argp);
case BTRFS_IOC_DEFRAG:
return btrfs_ioctl_defrag(file);
case BTRFS_IOC_RESIZE:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index b320b103fa13..bc49914475eb 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -65,5 +65,6 @@ struct btrfs_ioctl_clone_range_args {
#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
struct btrfs_ioctl_vol_args)
-
+#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
+ struct btrfs_ioctl_vol_args)
#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index d6f0806c682f..897fba835f89 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
*
* len is the length of the extent
*
- * This also sets the EXTENT_ORDERED bit on the range in the inode.
- *
* The tree is given a single reference on the ordered extent that was
* inserted.
*/
@@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->start = start;
entry->len = len;
entry->disk_len = disk_len;
+ entry->bytes_left = len;
entry->inode = inode;
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
@@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
&entry->rb_node);
BUG_ON(node);
- set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
- entry_end(entry) - 1, GFP_NOFS);
-
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
&BTRFS_I(inode)->root->fs_info->ordered_extents);
@@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
int ret;
tree = &BTRFS_I(inode)->ordered_tree;
mutex_lock(&tree->mutex);
- clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
- GFP_NOFS);
node = tree_search(tree, file_offset);
if (!node) {
ret = 1;
@@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
goto out;
}
- ret = test_range_bit(io_tree, entry->file_offset,
- entry->file_offset + entry->len - 1,
- EXTENT_ORDERED, 0);
- if (ret == 0)
+ if (io_size > entry->bytes_left) {
+ printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
+ (unsigned long long)entry->bytes_left,
+ (unsigned long long)io_size);
+ }
+ entry->bytes_left -= io_size;
+ if (entry->bytes_left == 0)
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ else
+ ret = 1;
out:
mutex_unlock(&tree->mutex);
return ret == 0;
@@ -460,7 +458,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
* start IO on any dirty ones so the wait doesn't stall waiting
* for pdflush to find them
*/
- btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL);
+ filemap_fdatawrite_range(inode->i_mapping, start, end);
if (wait) {
wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
&entry->flags));
@@ -476,6 +474,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
u64 orig_end;
u64 wait_end;
struct btrfs_ordered_extent *ordered;
+ int found;
if (start + len < start) {
orig_end = INT_LIMIT(loff_t);
@@ -489,19 +488,18 @@ again:
/* start IO across the range first to instantiate any delalloc
* extents
*/
- btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
+ filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
/* The compression code will leave pages locked but return from
* writepage without setting the page writeback. Starting again
* with WB_SYNC_ALL will end up waiting for the IO to actually start.
*/
- btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
+ filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
- btrfs_wait_on_page_writeback_range(inode->i_mapping,
- start >> PAGE_CACHE_SHIFT,
- orig_end >> PAGE_CACHE_SHIFT);
+ filemap_fdatawait_range(inode->i_mapping, start, orig_end);
end = orig_end;
+ found = 0;
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, end);
if (!ordered)
@@ -514,6 +512,7 @@ again:
btrfs_put_ordered_extent(ordered);
break;
}
+ found++;
btrfs_start_ordered_extent(inode, ordered, 1);
end = ordered->file_offset;
btrfs_put_ordered_extent(ordered);
@@ -521,8 +520,8 @@ again:
break;
end--;
}
- if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
- EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
+ if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
+ EXTENT_DELALLOC, 0, NULL)) {
schedule_timeout(1);
goto again;
}
@@ -613,7 +612,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
*/
if (test_range_bit(io_tree, disk_i_size,
ordered->file_offset + ordered->len - 1,
- EXTENT_DELALLOC, 0)) {
+ EXTENT_DELALLOC, 0, NULL)) {
goto out;
}
/*
@@ -664,7 +663,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
*/
if (i_size_test > entry_end(ordered) &&
!test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
- EXTENT_DELALLOC, 0)) {
+ EXTENT_DELALLOC, 0, NULL)) {
new_i_size = min_t(u64, i_size_test, i_size_read(inode));
}
BTRFS_I(inode)->disk_i_size = new_i_size;
@@ -715,90 +714,6 @@ out:
}
-/**
- * taken from mm/filemap.c because it isn't exported
- *
- * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
- * @mapping: address space structure to write
- * @start: offset in bytes where the range starts
- * @end: offset in bytes where the range ends (inclusive)
- * @sync_mode: enable synchronous operation
- *
- * Start writeback against all of a mapping's dirty pages that lie
- * within the byte offsets <start, end> inclusive.
- *
- * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
- * opposed to a regular memory cleansing writeback. The difference between
- * these two operations is that if a dirty page/buffer is encountered, it must
- * be waited upon, and not just skipped over.
- */
-int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
- loff_t end, int sync_mode)
-{
- struct writeback_control wbc = {
- .sync_mode = sync_mode,
- .nr_to_write = mapping->nrpages * 2,
- .range_start = start,
- .range_end = end,
- .for_writepages = 1,
- };
- return btrfs_writepages(mapping, &wbc);
-}
-
-/**
- * taken from mm/filemap.c because it isn't exported
- *
- * wait_on_page_writeback_range - wait for writeback to complete
- * @mapping: target address_space
- * @start: beginning page index
- * @end: ending page index
- *
- * Wait for writeback to complete against pages indexed by start->end
- * inclusive
- */
-int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
- pgoff_t start, pgoff_t end)
-{
- struct pagevec pvec;
- int nr_pages;
- int ret = 0;
- pgoff_t index;
-
- if (end < start)
- return 0;
-
- pagevec_init(&pvec, 0);
- index = start;
- while ((index <= end) &&
- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_WRITEBACK,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
- unsigned i;
-
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
-
- /* until radix tree lookup accepts end_index */
- if (page->index > end)
- continue;
-
- wait_on_page_writeback(page);
- if (PageError(page))
- ret = -EIO;
- }
- pagevec_release(&pvec);
- cond_resched();
- }
-
- /* Check for outstanding write errors */
- if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
- ret = -ENOSPC;
- if (test_and_clear_bit(AS_EIO, &mapping->flags))
- ret = -EIO;
-
- return ret;
-}
-
/*
* add a given inode to the list of inodes that must be fully on
* disk before a transaction commit finishes.
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 3d31c8827b01..f82e87488ca8 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -85,6 +85,9 @@ struct btrfs_ordered_extent {
/* extent length on disk */
u64 disk_len;
+ /* number of bytes that still need writing */
+ u64 bytes_left;
+
/* flags (described above) */
unsigned long flags;
@@ -150,10 +153,6 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
int btrfs_ordered_update_i_size(struct inode *inode,
struct btrfs_ordered_extent *ordered);
int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
- pgoff_t start, pgoff_t end);
-int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
- loff_t end, int sync_mode);
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 3c0d52af4f80..79cba5fbc28e 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -65,3 +65,23 @@ out:
btrfs_free_path(path);
return ret;
}
+
+int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = offset;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+
+ btrfs_free_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 6d6523da0a30..0d126be22b63 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -309,7 +309,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
}
printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
(unsigned long long)btrfs_header_bytenr(c),
- btrfs_header_level(c), nr,
+ level, nr,
(u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
for (i = 0; i < nr; i++) {
btrfs_node_key_to_cpu(c, &key, i);
@@ -326,10 +326,10 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
btrfs_level_size(root, level - 1),
btrfs_node_ptr_generation(c, i));
if (btrfs_is_leaf(next) &&
- btrfs_header_level(c) != 1)
+ level != 1)
BUG();
if (btrfs_header_level(next) !=
- btrfs_header_level(c) - 1)
+ level - 1)
BUG();
btrfs_print_tree(root, next);
free_extent_buffer(next);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b23dc209ae10..361ad323faac 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -121,6 +121,15 @@ struct inodevec {
int nr;
};
+#define MAX_EXTENTS 128
+
+struct file_extent_cluster {
+ u64 start;
+ u64 end;
+ u64 boundary[MAX_EXTENTS];
+ unsigned int nr;
+};
+
struct reloc_control {
/* block group to relocate */
struct btrfs_block_group_cache *block_group;
@@ -670,6 +679,8 @@ again:
err = ret;
goto out;
}
+ if (ret > 0 && path2->slots[level] > 0)
+ path2->slots[level]--;
eb = path2->nodes[level];
WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
@@ -1609,6 +1620,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
BUG_ON(level == 0);
path->lowest_level = level;
ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
+ path->lowest_level = 0;
if (ret < 0) {
btrfs_free_path(path);
return ret;
@@ -1788,7 +1800,7 @@ static void merge_func(struct btrfs_work *work)
btrfs_end_transaction(trans, root);
}
- btrfs_drop_dead_root(reloc_root);
+ btrfs_drop_snapshot(reloc_root, 0);
if (atomic_dec_and_test(async->num_pending))
complete(async->done);
@@ -2075,9 +2087,6 @@ static int do_relocation(struct btrfs_trans_handle *trans,
ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
BUG_ON(ret);
-
- btrfs_tree_unlock(eb);
- free_extent_buffer(eb);
}
if (!lowest) {
btrfs_tree_unlock(upper->eb);
@@ -2180,7 +2189,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
struct reloc_control *rc)
{
if (test_range_bit(&rc->processed_blocks, bytenr,
- bytenr + blocksize - 1, EXTENT_DIRTY, 1))
+ bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
return 1;
return 0;
}
@@ -2529,51 +2538,94 @@ out:
}
static noinline_for_stack
-int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
+int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
+ u64 block_start)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
+ int ret = 0;
+
+ em = alloc_extent_map(GFP_NOFS);
+ if (!em)
+ return -ENOMEM;
+
+ em->start = start;
+ em->len = end + 1 - start;
+ em->block_len = em->len;
+ em->block_start = block_start;
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+ lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(inode, start, end, 0);
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+ return ret;
+}
+
+static int relocate_file_extent_cluster(struct inode *inode,
+ struct file_extent_cluster *cluster)
{
u64 page_start;
u64 page_end;
- unsigned long i;
- unsigned long first_index;
+ u64 offset = BTRFS_I(inode)->index_cnt;
+ unsigned long index;
unsigned long last_index;
- unsigned int total_read = 0;
- unsigned int total_dirty = 0;
+ unsigned int dirty_page = 0;
struct page *page;
struct file_ra_state *ra;
- struct btrfs_ordered_extent *ordered;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ int nr = 0;
int ret = 0;
+ if (!cluster->nr)
+ return 0;
+
ra = kzalloc(sizeof(*ra), GFP_NOFS);
if (!ra)
return -ENOMEM;
+ index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
+ last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
+
mutex_lock(&inode->i_mutex);
- first_index = start >> PAGE_CACHE_SHIFT;
- last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
- /* make sure the dirty trick played by the caller work */
- ret = invalidate_inode_pages2_range(inode->i_mapping,
- first_index, last_index);
+ i_size_write(inode, cluster->end + 1 - offset);
+ ret = setup_extent_mapping(inode, cluster->start - offset,
+ cluster->end - offset, cluster->start);
if (ret)
goto out_unlock;
file_ra_state_init(ra, inode->i_mapping);
- for (i = first_index ; i <= last_index; i++) {
- if (total_read % ra->ra_pages == 0) {
- btrfs_force_ra(inode->i_mapping, ra, NULL, i,
- min(last_index, ra->ra_pages + i - 1));
- }
- total_read++;
-again:
- if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
- BUG_ON(1);
- page = grab_cache_page(inode->i_mapping, i);
+ WARN_ON(cluster->start != cluster->boundary[0]);
+ while (index <= last_index) {
+ page = find_lock_page(inode->i_mapping, index);
if (!page) {
- ret = -ENOMEM;
- goto out_unlock;
+ page_cache_sync_readahead(inode->i_mapping,
+ ra, NULL, index,
+ last_index + 1 - index);
+ page = grab_cache_page(inode->i_mapping, index);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+ }
+
+ if (PageReadahead(page)) {
+ page_cache_async_readahead(inode->i_mapping,
+ ra, NULL, page, index,
+ last_index + 1 - index);
}
+
if (!PageUptodate(page)) {
btrfs_readpage(NULL, page);
lock_page(page);
@@ -2584,75 +2636,79 @@ again:
goto out_unlock;
}
}
- wait_on_page_writeback(page);
page_start = (u64)page->index << PAGE_CACHE_SHIFT;
page_end = page_start + PAGE_CACHE_SIZE - 1;
- lock_extent(io_tree, page_start, page_end, GFP_NOFS);
-
- ordered = btrfs_lookup_ordered_extent(inode, page_start);
- if (ordered) {
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
- unlock_page(page);
- page_cache_release(page);
- btrfs_start_ordered_extent(inode, ordered, 1);
- btrfs_put_ordered_extent(ordered);
- goto again;
- }
+
+ lock_extent(&BTRFS_I(inode)->io_tree,
+ page_start, page_end, GFP_NOFS);
+
set_page_extent_mapped(page);
- if (i == first_index)
- set_extent_bits(io_tree, page_start, page_end,
+ if (nr < cluster->nr &&
+ page_start + offset == cluster->boundary[nr]) {
+ set_extent_bits(&BTRFS_I(inode)->io_tree,
+ page_start, page_end,
EXTENT_BOUNDARY, GFP_NOFS);
+ nr++;
+ }
btrfs_set_extent_delalloc(inode, page_start, page_end);
set_page_dirty(page);
- total_dirty++;
+ dirty_page++;
- unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+ unlock_extent(&BTRFS_I(inode)->io_tree,
+ page_start, page_end, GFP_NOFS);
unlock_page(page);
page_cache_release(page);
+
+ index++;
+ if (nr < cluster->nr &&
+ page_end + 1 + offset == cluster->boundary[nr]) {
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+ dirty_page);
+ dirty_page = 0;
+ }
+ }
+ if (dirty_page) {
+ balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+ dirty_page);
}
+ WARN_ON(nr != cluster->nr);
out_unlock:
mutex_unlock(&inode->i_mutex);
kfree(ra);
- balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
return ret;
}
static noinline_for_stack
-int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key)
+int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
+ struct file_extent_cluster *cluster)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
- struct extent_map *em;
- u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt;
- u64 end = start + extent_key->offset - 1;
-
- em = alloc_extent_map(GFP_NOFS);
- em->start = start;
- em->len = extent_key->offset;
- em->block_len = extent_key->offset;
- em->block_start = extent_key->objectid;
- em->bdev = root->fs_info->fs_devices->latest_bdev;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ int ret;
- /* setup extent map to cheat btrfs_readpage */
- lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
- while (1) {
- int ret;
- spin_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
- if (ret != -EEXIST) {
- free_extent_map(em);
- break;
- }
- btrfs_drop_extent_cache(inode, start, end, 0);
+ if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
+ ret = relocate_file_extent_cluster(inode, cluster);
+ if (ret)
+ return ret;
+ cluster->nr = 0;
}
- unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
- return relocate_inode_pages(inode, start, extent_key->offset);
+ if (!cluster->nr)
+ cluster->start = extent_key->objectid;
+ else
+ BUG_ON(cluster->nr >= MAX_EXTENTS);
+ cluster->end = extent_key->objectid + extent_key->offset - 1;
+ cluster->boundary[cluster->nr] = extent_key->objectid;
+ cluster->nr++;
+
+ if (cluster->nr >= MAX_EXTENTS) {
+ ret = relocate_file_extent_cluster(inode, cluster);
+ if (ret)
+ return ret;
+ cluster->nr = 0;
+ }
+ return 0;
}
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
@@ -3198,10 +3254,12 @@ static int check_extent_flags(u64 flags)
return 0;
}
+
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
{
struct rb_root blocks = RB_ROOT;
struct btrfs_key key;
+ struct file_extent_cluster *cluster;
struct btrfs_trans_handle *trans = NULL;
struct btrfs_path *path;
struct btrfs_extent_item *ei;
@@ -3211,10 +3269,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
int ret;
int err = 0;
+ cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
+ if (!cluster)
+ return -ENOMEM;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ rc->extents_found = 0;
+ rc->extents_skipped = 0;
+
rc->search_start = rc->block_group->key.objectid;
clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
GFP_NOFS);
@@ -3301,14 +3366,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
}
nr = trans->blocks_used;
- btrfs_end_transaction_throttle(trans, rc->extent_root);
+ btrfs_end_transaction(trans, rc->extent_root);
trans = NULL;
btrfs_btree_balance_dirty(rc->extent_root, nr);
if (rc->stage == MOVE_DATA_EXTENTS &&
(flags & BTRFS_EXTENT_FLAG_DATA)) {
rc->found_file_extent = 1;
- ret = relocate_data_extent(rc->data_inode, &key);
+ ret = relocate_data_extent(rc->data_inode,
+ &key, cluster);
if (ret < 0) {
err = ret;
break;
@@ -3323,6 +3389,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
btrfs_btree_balance_dirty(rc->extent_root, nr);
}
+ if (!err) {
+ ret = relocate_file_extent_cluster(rc->data_inode, cluster);
+ if (ret < 0)
+ err = ret;
+ }
+
+ kfree(cluster);
+
rc->create_reloc_root = 0;
smp_mb();
@@ -3343,8 +3417,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
}
static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 objectid, u64 size)
+ struct btrfs_root *root, u64 objectid)
{
struct btrfs_path *path;
struct btrfs_inode_item *item;
@@ -3363,7 +3436,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
btrfs_set_inode_generation(leaf, item, 1);
- btrfs_set_inode_size(leaf, item, size);
+ btrfs_set_inode_size(leaf, item, 0);
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
btrfs_mark_buffer_dirty(leaf);
@@ -3399,12 +3472,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
if (err)
goto out;
- err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
- BUG_ON(err);
-
- err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
- group->key.offset, 0, group->key.offset,
- 0, 0, 0);
+ err = __insert_orphan_inode(trans, root, objectid);
BUG_ON(err);
key.objectid = objectid;
@@ -3470,14 +3538,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
btrfs_wait_ordered_extents(fs_info->tree_root, 0);
while (1) {
- mutex_lock(&fs_info->cleaner_mutex);
- btrfs_clean_old_snapshots(fs_info->tree_root);
- mutex_unlock(&fs_info->cleaner_mutex);
-
rc->extents_found = 0;
rc->extents_skipped = 0;
+ mutex_lock(&fs_info->cleaner_mutex);
+
+ btrfs_clean_old_snapshots(fs_info->tree_root);
ret = relocate_block_group(rc);
+
+ mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
err = ret;
break;
@@ -3509,10 +3578,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
}
}
- filemap_fdatawrite_range(fs_info->btree_inode->i_mapping,
- rc->block_group->key.objectid,
- rc->block_group->key.objectid +
- rc->block_group->key.offset - 1);
+ filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
+ rc->block_group->key.objectid,
+ rc->block_group->key.objectid +
+ rc->block_group->key.offset - 1);
WARN_ON(rc->block_group->pinned > 0);
WARN_ON(rc->block_group->reserved > 0);
@@ -3525,6 +3594,26 @@ out:
return err;
}
+static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
+
+ memset(&root->root_item.drop_progress, 0,
+ sizeof(root->root_item.drop_progress));
+ root->root_item.drop_level = 0;
+ btrfs_set_root_refs(&root->root_item, 0);
+ ret = btrfs_update_root(trans, root->fs_info->tree_root,
+ &root->root_key, &root->root_item);
+ BUG_ON(ret);
+
+ ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
+ BUG_ON(ret);
+ return 0;
+}
+
/*
* recover relocation interrupted by system crash.
*
@@ -3584,8 +3673,12 @@ int btrfs_recover_relocation(struct btrfs_root *root)
fs_root = read_fs_root(root->fs_info,
reloc_root->root_key.offset);
if (IS_ERR(fs_root)) {
- err = PTR_ERR(fs_root);
- goto out;
+ ret = PTR_ERR(fs_root);
+ if (ret != -ENOENT) {
+ err = ret;
+ goto out;
+ }
+ mark_garbage_root(reloc_root);
}
}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 0ddc6d61c55a..9351428f30e2 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -94,17 +94,23 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
goto out;
BUG_ON(ret == 0);
+ if (path->slots[0] == 0) {
+ ret = 1;
+ goto out;
+ }
l = path->nodes[0];
- BUG_ON(path->slots[0] == 0);
slot = path->slots[0] - 1;
btrfs_item_key_to_cpu(l, &found_key, slot);
- if (found_key.objectid != objectid) {
+ if (found_key.objectid != objectid ||
+ found_key.type != BTRFS_ROOT_ITEM_KEY) {
ret = 1;
goto out;
}
- read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
- sizeof(*item));
- memcpy(key, &found_key, sizeof(found_key));
+ if (item)
+ read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
+ sizeof(*item));
+ if (key)
+ memcpy(key, &found_key, sizeof(found_key));
ret = 0;
out:
btrfs_free_path(path);
@@ -249,6 +255,59 @@ err:
return ret;
}
+int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
+{
+ struct extent_buffer *leaf;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int err = 0;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = 0;
+
+ while (1) {
+ ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
+ if (ret < 0) {
+ err = ret;
+ break;
+ }
+
+ leaf = path->nodes[0];
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(tree_root, path);
+ if (ret < 0)
+ err = ret;
+ if (ret != 0)
+ break;
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ btrfs_release_path(tree_root, path);
+
+ if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
+ key.type != BTRFS_ORPHAN_ITEM_KEY)
+ break;
+
+ ret = btrfs_find_dead_roots(tree_root, key.offset);
+ if (ret) {
+ err = ret;
+ break;
+ }
+
+ key.offset++;
+ }
+
+ btrfs_free_path(path);
+ return err;
+}
+
/* drop the root item for 'key' from 'root' */
int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_key *key)
@@ -278,31 +337,57 @@ out:
return ret;
}
-#if 0 /* this will get used when snapshot deletion is implemented */
int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root,
- u64 root_id, u8 type, u64 ref_id)
+ u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
+ const char *name, int name_len)
+
{
+ struct btrfs_path *path;
+ struct btrfs_root_ref *ref;
+ struct extent_buffer *leaf;
struct btrfs_key key;
+ unsigned long ptr;
+ int err = 0;
int ret;
- struct btrfs_path *path;
path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
key.objectid = root_id;
- key.type = type;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = ref_id;
-
+again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- BUG_ON(ret);
-
- ret = btrfs_del_item(trans, tree_root, path);
- BUG_ON(ret);
+ BUG_ON(ret < 0);
+ if (ret == 0) {
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_root_ref);
+
+ WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
+ WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
+ ptr = (unsigned long)(ref + 1);
+ WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
+ *sequence = btrfs_root_ref_sequence(leaf, ref);
+
+ ret = btrfs_del_item(trans, tree_root, path);
+ BUG_ON(ret);
+ } else
+ err = -ENOENT;
+
+ if (key.type == BTRFS_ROOT_BACKREF_KEY) {
+ btrfs_release_path(tree_root, path);
+ key.objectid = ref_id;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = root_id;
+ goto again;
+ }
btrfs_free_path(path);
- return ret;
+ return err;
}
-#endif
int btrfs_find_root_ref(struct btrfs_root *tree_root,
struct btrfs_path *path,
@@ -319,7 +404,6 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
return ret;
}
-
/*
* add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
* or BTRFS_ROOT_BACKREF_KEY.
@@ -335,8 +419,7 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
*/
int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root,
- u64 root_id, u8 type, u64 ref_id,
- u64 dirid, u64 sequence,
+ u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
const char *name, int name_len)
{
struct btrfs_key key;
@@ -346,13 +429,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
unsigned long ptr;
-
path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
key.objectid = root_id;
- key.type = type;
+ key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = ref_id;
-
+again:
ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
sizeof(*ref) + name_len);
BUG_ON(ret);
@@ -366,6 +450,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, name, ptr, name_len);
btrfs_mark_buffer_dirty(leaf);
+ if (key.type == BTRFS_ROOT_BACKREF_KEY) {
+ btrfs_release_path(tree_root, path);
+ key.objectid = ref_id;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = root_id;
+ goto again;
+ }
+
btrfs_free_path(path);
- return ret;
+ return 0;
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9f179d4832d5..9de9b2236419 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -26,7 +26,6 @@
#include <linux/init.h>
#include <linux/seq_file.h>
#include <linux/string.h>
-#include <linux/smp_lock.h>
#include <linux/backing-dev.h>
#include <linux/mount.h>
#include <linux/mpage.h>
@@ -52,7 +51,7 @@
#include "export.h"
#include "compression.h"
-static struct super_operations btrfs_super_ops;
+static const struct super_operations btrfs_super_ops;
static void btrfs_put_super(struct super_block *sb)
{
@@ -345,7 +344,9 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_export_op = &btrfs_export_ops;
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
+#ifdef CONFIG_BTRFS_POSIX_ACL
sb->s_flags |= MS_POSIXACL;
+#endif
tree_root = open_ctree(sb, fs_devices, (char *)data);
@@ -676,7 +677,8 @@ static int btrfs_unfreeze(struct super_block *sb)
return 0;
}
-static struct super_operations btrfs_super_ops = {
+static const struct super_operations btrfs_super_ops = {
+ .drop_inode = btrfs_drop_inode,
.delete_inode = btrfs_delete_inode,
.put_super = btrfs_put_super,
.sync_fs = btrfs_sync_fs,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4e83457ea253..0b8f36d4400a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -40,6 +40,12 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
}
}
+static noinline void switch_commit_root(struct btrfs_root *root)
+{
+ free_extent_buffer(root->commit_root);
+ root->commit_root = btrfs_root_node(root);
+}
+
/*
* either allocate a new transaction or hop into the existing one
*/
@@ -98,7 +104,6 @@ static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
{
if (root->ref_cows && root->last_trans < trans->transid) {
WARN_ON(root == root->fs_info->extent_root);
- WARN_ON(root->root_item.refs == 0);
WARN_ON(root->commit_root != root->node);
radix_tree_tag_set(&root->fs_info->fs_roots_radix,
@@ -181,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
h->alloc_exclude_start = 0;
h->delayed_ref_updates = 0;
+ if (!current->journal_info)
+ current->journal_info = h;
+
root->fs_info->running_transaction->use_count++;
record_root_in_trans(h, root);
mutex_unlock(&root->fs_info->trans_mutex);
@@ -312,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
wake_up(&cur_trans->writer_wait);
put_transaction(cur_trans);
mutex_unlock(&info->trans_mutex);
+
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
memset(trans, 0, sizeof(*trans));
kmem_cache_free(btrfs_trans_handle_cachep, trans);
@@ -444,9 +455,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
btrfs_write_dirty_block_groups(trans, root);
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- BUG_ON(ret);
-
while (1) {
old_root_bytenr = btrfs_root_bytenr(&root->root_item);
if (old_root_bytenr == root->node->start)
@@ -457,13 +465,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
&root->root_key,
&root->root_item);
BUG_ON(ret);
- btrfs_write_dirty_block_groups(trans, root);
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ ret = btrfs_write_dirty_block_groups(trans, root);
BUG_ON(ret);
}
- free_extent_buffer(root->commit_root);
- root->commit_root = btrfs_root_node(root);
+
+ if (root != root->fs_info->extent_root)
+ switch_commit_root(root);
+
return 0;
}
@@ -495,10 +504,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
root = list_entry(next, struct btrfs_root, dirty_list);
update_cowonly_root(trans, root);
-
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- BUG_ON(ret);
}
+
+ down_write(&fs_info->extent_commit_sem);
+ switch_commit_root(fs_info->extent_root);
+ up_write(&fs_info->extent_commit_sem);
+
return 0;
}
@@ -544,8 +555,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
btrfs_update_reloc_root(trans, root);
if (root->commit_root != root->node) {
- free_extent_buffer(root->commit_root);
- root->commit_root = btrfs_root_node(root);
+ switch_commit_root(root);
btrfs_set_root_node(&root->root_item,
root->node);
}
@@ -593,6 +603,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
return 0;
}
+#if 0
/*
* when dropping snapshots, we generate a ton of delayed refs, and it makes
* sense not to join the transaction while it is trying to flush the current
@@ -681,6 +692,7 @@ int btrfs_drop_dead_root(struct btrfs_root *root)
btrfs_btree_balance_dirty(tree_root, nr);
return ret;
}
+#endif
/*
* new snapshots need to be created at a very specific time in the
@@ -713,7 +725,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
key.objectid = objectid;
- key.offset = 0;
+ /* record when the snapshot was created in key.offset */
+ key.offset = trans->transid;
btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
old = btrfs_lock_root_node(root);
@@ -736,6 +749,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
memcpy(&pending->root_key, &key, sizeof(key));
fail:
kfree(new_root_item);
+ btrfs_unreserve_metadata_space(root, 6);
return ret;
}
@@ -771,24 +785,14 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
ret = btrfs_update_inode(trans, parent_root, parent_inode);
BUG_ON(ret);
- /* add the backref first */
ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
pending->root_key.objectid,
- BTRFS_ROOT_BACKREF_KEY,
parent_root->root_key.objectid,
parent_inode->i_ino, index, pending->name,
namelen);
BUG_ON(ret);
- /* now add the forward ref */
- ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
- parent_root->root_key.objectid,
- BTRFS_ROOT_REF_KEY,
- pending->root_key.objectid,
- parent_inode->i_ino, index, pending->name,
- namelen);
-
inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
d_instantiate(pending->dentry, inode);
fail:
@@ -850,6 +854,16 @@ static void update_super_roots(struct btrfs_root *root)
super->root_level = root_item->level;
}
+int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
+{
+ int ret = 0;
+ spin_lock(&info->new_trans_lock);
+ if (info->running_transaction)
+ ret = info->running_transaction->in_commit;
+ spin_unlock(&info->new_trans_lock);
+ return ret;
+}
+
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
@@ -857,7 +871,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
unsigned long timeout = 1;
struct btrfs_transaction *cur_trans;
struct btrfs_transaction *prev_trans = NULL;
- struct extent_io_tree *pinned_copy;
DEFINE_WAIT(wait);
int ret;
int should_grow = 0;
@@ -898,13 +911,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return 0;
}
- pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
- if (!pinned_copy)
- return -ENOMEM;
-
- extent_io_tree_init(pinned_copy,
- root->fs_info->btree_inode->i_mapping, GFP_NOFS);
-
trans->transaction->in_commit = 1;
trans->transaction->blocked = 1;
if (cur_trans->list.prev != &root->fs_info->trans_list) {
@@ -941,9 +947,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_unlock(&root->fs_info->trans_mutex);
- if (flush_on_commit || snap_pending) {
- if (flush_on_commit)
- btrfs_start_delalloc_inodes(root);
+ if (flush_on_commit) {
+ btrfs_start_delalloc_inodes(root);
+ ret = btrfs_wait_ordered_extents(root, 0);
+ BUG_ON(ret);
+ } else if (snap_pending) {
ret = btrfs_wait_ordered_extents(root, 1);
BUG_ON(ret);
}
@@ -1000,6 +1008,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = commit_cowonly_roots(trans, root);
BUG_ON(ret);
+ btrfs_prepare_extent_commit(trans, root);
+
cur_trans = root->fs_info->running_transaction;
spin_lock(&root->fs_info->new_trans_lock);
root->fs_info->running_transaction = NULL;
@@ -1007,15 +1017,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_set_root_node(&root->fs_info->tree_root->root_item,
root->fs_info->tree_root->node);
- free_extent_buffer(root->fs_info->tree_root->commit_root);
- root->fs_info->tree_root->commit_root =
- btrfs_root_node(root->fs_info->tree_root);
+ switch_commit_root(root->fs_info->tree_root);
btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
root->fs_info->chunk_root->node);
- free_extent_buffer(root->fs_info->chunk_root->commit_root);
- root->fs_info->chunk_root->commit_root =
- btrfs_root_node(root->fs_info->chunk_root);
+ switch_commit_root(root->fs_info->chunk_root);
update_super_roots(root);
@@ -1027,8 +1033,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
sizeof(root->fs_info->super_copy));
- btrfs_copy_pinned(root, pinned_copy);
-
trans->transaction->blocked = 0;
wake_up(&root->fs_info->transaction_wait);
@@ -1044,8 +1048,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
*/
mutex_unlock(&root->fs_info->tree_log_mutex);
- btrfs_finish_extent_commit(trans, root, pinned_copy);
- kfree(pinned_copy);
+ btrfs_finish_extent_commit(trans, root);
/* do the directory inserts of any pending snapshot creations */
finish_pending_snapshots(trans, root->fs_info);
@@ -1055,6 +1058,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
cur_trans->commit_done = 1;
root->fs_info->last_trans_committed = cur_trans->transid;
+
wake_up(&cur_trans->commit_wait);
put_transaction(cur_trans);
@@ -1062,6 +1066,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_unlock(&root->fs_info->trans_mutex);
+ if (current->journal_info == trans)
+ current->journal_info = NULL;
+
kmem_cache_free(btrfs_trans_handle_cachep, trans);
return ret;
}
@@ -1080,8 +1087,13 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
while (!list_empty(&list)) {
root = list_entry(list.next, struct btrfs_root, root_list);
- list_del_init(&root->root_list);
- btrfs_drop_dead_root(root);
+ list_del(&root->root_list);
+
+ if (btrfs_header_backref_rev(root->node) <
+ BTRFS_MIXED_BACKREF_REV)
+ btrfs_drop_snapshot(root, 0);
+ else
+ btrfs_drop_snapshot(root, 1);
}
return 0;
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 961c3ee5a2e1..663c67404918 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -107,4 +107,5 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages);
+int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c13922206d1b..7827841b55cb 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -263,8 +263,8 @@ static int process_one_buffer(struct btrfs_root *log,
struct walk_control *wc, u64 gen)
{
if (wc->pin)
- btrfs_update_pinned_extents(log->fs_info->extent_root,
- eb->start, eb->len, 1);
+ btrfs_pin_extent(log->fs_info->extent_root,
+ eb->start, eb->len, 0);
if (btrfs_buffer_uptodate(eb, gen)) {
if (wc->write)
@@ -534,7 +534,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
saved_nbytes = inode_get_bytes(inode);
/* drop any overlapping extents */
ret = btrfs_drop_extents(trans, root, inode,
- start, extent_end, extent_end, start, &alloc_hint);
+ start, extent_end, extent_end, start, &alloc_hint, 1);
BUG_ON(ret);
if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -797,7 +797,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
return -ENOENT;
inode = read_one_inode(root, key->objectid);
- BUG_ON(!dir);
+ BUG_ON(!inode);
ref_ptr = btrfs_item_ptr_offset(eb, slot);
ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
@@ -2605,7 +2605,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
extent);
cs = btrfs_file_extent_offset(src, extent);
cl = btrfs_file_extent_num_bytes(src,
- extent);;
+ extent);
if (btrfs_file_extent_compression(src,
extent)) {
cs = 0;
@@ -2841,7 +2841,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
break;
- if (parent == sb->s_root)
+ if (IS_ROOT(parent))
break;
parent = parent->d_parent;
@@ -2880,6 +2880,12 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
+ if (root != BTRFS_I(inode)->root ||
+ btrfs_root_refs(&root->root_item) == 0) {
+ ret = 1;
+ goto end_no_trans;
+ }
+
ret = check_parent_dirs_for_sync(trans, inode, parent,
sb, last_committed);
if (ret)
@@ -2907,12 +2913,15 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
break;
inode = parent->d_inode;
+ if (root != BTRFS_I(inode)->root)
+ break;
+
if (BTRFS_I(inode)->generation >
root->fs_info->last_trans_committed) {
ret = btrfs_log_inode(trans, root, inode, inode_only);
BUG_ON(ret);
}
- if (parent == sb->s_root)
+ if (IS_ROOT(parent))
break;
parent = parent->d_parent;
@@ -2951,7 +2960,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
struct btrfs_key tmp_key;
struct btrfs_root *log;
struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
- u64 highest_inode;
struct walk_control wc = {
.process_func = process_one_buffer,
.stage = 0,
@@ -3010,11 +3018,6 @@ again:
path);
BUG_ON(ret);
}
- ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
- if (ret == 0) {
- wc.replay_dest->highest_inode = highest_inode;
- wc.replay_dest->last_inode_alloc = highest_inode;
- }
key.offset = found_key.offset - 1;
wc.replay_dest->log_root = NULL;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3ab80e9cd767..7eda483d7b5a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -260,7 +260,7 @@ loop_lock:
num_run++;
batch_run++;
- if (bio_sync(cur))
+ if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
num_sync_run++;
if (need_resched()) {
@@ -276,7 +276,7 @@ loop_lock:
* is now congested. Back off and let other work structs
* run instead
*/
- if (pending && bdi_write_congested(bdi) && batch_run > 32 &&
+ if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
fs_info->fs_devices->open_devices > 1) {
struct io_context *ioc;
@@ -446,8 +446,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
goto error;
device->name = kstrdup(orig_dev->name, GFP_NOFS);
- if (!device->name)
+ if (!device->name) {
+ kfree(device);
goto error;
+ }
device->devid = orig_dev->devid;
device->work.func = pending_bios_fn;
@@ -719,9 +721,9 @@ error:
* called very infrequently and that a given device has a small number
* of extents
*/
-static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
- struct btrfs_device *device,
- u64 num_bytes, u64 *start)
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 *start, u64 *max_avail)
{
struct btrfs_key key;
struct btrfs_root *root = device->dev_root;
@@ -758,9 +760,13 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
if (ret < 0)
goto error;
- ret = btrfs_previous_item(root, path, 0, key.type);
- if (ret < 0)
- goto error;
+ if (ret > 0) {
+ ret = btrfs_previous_item(root, path, key.objectid, key.type);
+ if (ret < 0)
+ goto error;
+ if (ret > 0)
+ start_found = 1;
+ }
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
while (1) {
@@ -803,6 +809,10 @@ no_more_items:
if (last_byte < search_start)
last_byte = search_start;
hole_size = key.offset - last_byte;
+
+ if (hole_size > *max_avail)
+ *max_avail = hole_size;
+
if (key.offset > last_byte &&
hole_size >= num_bytes) {
*start = last_byte;
@@ -1621,6 +1631,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
device->fs_devices->total_rw_bytes += diff;
device->total_bytes = new_size;
+ device->disk_total_bytes = new_size;
btrfs_clear_space_info_full(device->dev_root->fs_info);
return btrfs_update_device(trans, device);
@@ -1726,6 +1737,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
extent_root = root->fs_info->extent_root;
em_tree = &root->fs_info->mapping_tree.map_tree;
+ ret = btrfs_can_relocate(extent_root, chunk_offset);
+ if (ret)
+ return -ENOSPC;
+
/* step one, relocate all the extents inside this chunk */
ret = btrfs_relocate_block_group(extent_root, chunk_offset);
BUG_ON(ret);
@@ -1739,9 +1754,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
* step two, delete the device extents and the
* chunk tree entries
*/
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
BUG_ON(em->start > chunk_offset ||
em->start + em->len < chunk_offset);
@@ -1770,9 +1785,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
BUG_ON(ret);
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
kfree(map);
em->bdev = NULL;
@@ -1797,12 +1812,15 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
struct btrfs_key found_key;
u64 chunk_tree = chunk_root->root_key.objectid;
u64 chunk_type;
+ bool retried = false;
+ int failed = 0;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+again:
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
@@ -1832,7 +1850,10 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
found_key.objectid,
found_key.offset);
- BUG_ON(ret);
+ if (ret == -ENOSPC)
+ failed++;
+ else if (ret)
+ BUG();
}
if (found_key.offset == 0)
@@ -1840,6 +1861,14 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
key.offset = found_key.offset - 1;
}
ret = 0;
+ if (failed && !retried) {
+ failed = 0;
+ retried = true;
+ goto again;
+ } else if (failed && retried) {
+ WARN_ON(1);
+ ret = -ENOSPC;
+ }
error:
btrfs_free_path(path);
return ret;
@@ -1884,6 +1913,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
continue;
ret = btrfs_shrink_device(device, old_size - size_to_free);
+ if (ret == -ENOSPC)
+ break;
BUG_ON(ret);
trans = btrfs_start_transaction(dev_root, 1);
@@ -1928,9 +1959,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
chunk = btrfs_item_ptr(path->nodes[0],
path->slots[0],
struct btrfs_chunk);
- key.offset = found_key.offset;
/* chunk zero is special */
- if (key.offset == 0)
+ if (found_key.offset == 0)
break;
btrfs_release_path(chunk_root, path);
@@ -1938,7 +1968,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
chunk_root->root_key.objectid,
found_key.objectid,
found_key.offset);
- BUG_ON(ret);
+ BUG_ON(ret && ret != -ENOSPC);
+ key.offset = found_key.offset - 1;
}
ret = 0;
error:
@@ -1964,10 +1995,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
u64 chunk_offset;
int ret;
int slot;
+ int failed = 0;
+ bool retried = false;
struct extent_buffer *l;
struct btrfs_key key;
struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
+ u64 old_size = device->total_bytes;
u64 diff = device->total_bytes - new_size;
if (new_size >= device->total_bytes)
@@ -1977,12 +2011,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 1);
- if (!trans) {
- ret = -ENOMEM;
- goto done;
- }
-
path->reada = 2;
lock_chunks(root);
@@ -1991,8 +2019,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
if (device->writeable)
device->fs_devices->total_rw_bytes -= diff;
unlock_chunks(root);
- btrfs_end_transaction(trans, root);
+again:
key.objectid = device->devid;
key.offset = (u64)-1;
key.type = BTRFS_DEV_EXTENT_KEY;
@@ -2007,21 +2035,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
goto done;
if (ret) {
ret = 0;
- goto done;
+ btrfs_release_path(root, path);
+ break;
}
l = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
- if (key.objectid != device->devid)
- goto done;
+ if (key.objectid != device->devid) {
+ btrfs_release_path(root, path);
+ break;
+ }
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
length = btrfs_dev_extent_length(l, dev_extent);
- if (key.offset + length <= new_size)
+ if (key.offset + length <= new_size) {
+ btrfs_release_path(root, path);
break;
+ }
chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -2030,8 +2063,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
chunk_offset);
- if (ret)
+ if (ret && ret != -ENOSPC)
goto done;
+ if (ret == -ENOSPC)
+ failed++;
+ key.offset -= 1;
+ }
+
+ if (failed && !retried) {
+ failed = 0;
+ retried = true;
+ goto again;
+ } else if (failed && retried) {
+ ret = -ENOSPC;
+ lock_chunks(root);
+
+ device->total_bytes = old_size;
+ if (device->writeable)
+ device->fs_devices->total_rw_bytes += diff;
+ unlock_chunks(root);
+ goto done;
}
/* Shrinking succeeded, else we would be at "done". */
@@ -2171,6 +2222,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
max_chunk_size);
again:
+ max_avail = 0;
if (!map || map->num_stripes != num_stripes) {
kfree(map);
map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
@@ -2219,7 +2271,8 @@ again:
if (device->in_fs_metadata && avail >= min_free) {
ret = find_free_dev_extent(trans, device,
- min_free, &dev_offset);
+ min_free, &dev_offset,
+ &max_avail);
if (ret == 0) {
list_move_tail(&device->dev_alloc_list,
&private_devs);
@@ -2282,9 +2335,9 @@ again:
em->block_len = em->len;
em_tree = &extent_root->fs_info->mapping_tree.map_tree;
- spin_lock(&em_tree->lock);
+ write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
- spin_unlock(&em_tree->lock);
+ write_unlock(&em_tree->lock);
BUG_ON(ret);
free_extent_map(em);
@@ -2479,9 +2532,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
int readonly = 0;
int i;
- spin_lock(&map_tree->map_tree.lock);
+ read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
- spin_unlock(&map_tree->map_tree.lock);
+ read_unlock(&map_tree->map_tree.lock);
if (!em)
return 1;
@@ -2506,11 +2559,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
struct extent_map *em;
while (1) {
- spin_lock(&tree->map_tree.lock);
+ write_lock(&tree->map_tree.lock);
em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
if (em)
remove_extent_mapping(&tree->map_tree, em);
- spin_unlock(&tree->map_tree.lock);
+ write_unlock(&tree->map_tree.lock);
if (!em)
break;
kfree(em->bdev);
@@ -2528,9 +2581,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
struct extent_map_tree *em_tree = &map_tree->map_tree;
int ret;
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, len);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
BUG_ON(!em);
BUG_ON(em->start > logical || em->start + em->len < logical);
@@ -2592,9 +2645,9 @@ again:
atomic_set(&multi->error, 0);
}
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, *length);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
if (!em && unplug_page)
return 0;
@@ -2751,9 +2804,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 stripe_nr;
int i, j, nr = 0;
- spin_lock(&em_tree->lock);
+ read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_start, 1);
- spin_unlock(&em_tree->lock);
+ read_unlock(&em_tree->lock);
BUG_ON(!em || em->start != chunk_start);
map = (struct map_lookup *)em->bdev;
@@ -2795,26 +2848,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
}
}
- for (i = 0; i > nr; i++) {
- struct btrfs_multi_bio *multi;
- struct btrfs_bio_stripe *stripe;
- int ret;
-
- length = 1;
- ret = btrfs_map_block(map_tree, WRITE, buf[i],
- &length, &multi, 0);
- BUG_ON(ret);
-
- stripe = multi->stripes;
- for (j = 0; j < multi->num_stripes; j++) {
- if (stripe->physical >= physical &&
- physical < stripe->physical + length)
- break;
- }
- BUG_ON(j >= multi->num_stripes);
- kfree(multi);
- }
-
*logical = buf;
*naddrs = nr;
*stripe_len = map->stripe_len;
@@ -2911,7 +2944,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
bio->bi_rw |= rw;
spin_lock(&device->io_lock);
- if (bio_sync(bio))
+ if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
pending_bios = &device->pending_sync_bios;
else
pending_bios = &device->pending_bios;
@@ -3061,9 +3094,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
- spin_lock(&map_tree->map_tree.lock);
+ read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
- spin_unlock(&map_tree->map_tree.lock);
+ read_unlock(&map_tree->map_tree.lock);
/* already mapped? */
if (em && em->start <= logical && em->start + em->len > logical) {
@@ -3122,9 +3155,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
map->stripes[i].dev->in_fs_metadata = 1;
}
- spin_lock(&map_tree->map_tree.lock);
+ write_lock(&map_tree->map_tree.lock);
ret = add_extent_mapping(&map_tree->map_tree, em);
- spin_unlock(&map_tree->map_tree.lock);
+ write_unlock(&map_tree->map_tree.lock);
BUG_ON(ret);
free_extent_map(em);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5139a833f721..31b0fabdd2ea 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -181,4 +181,7 @@ int btrfs_balance(struct btrfs_root *dev_root);
void btrfs_unlock_volumes(void);
void btrfs_lock_volumes(void);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
+int find_free_dev_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device, u64 num_bytes,
+ u64 *start, u64 *max_avail);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a9d3bf4d2689..b0fc93f95fd0 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -260,7 +260,7 @@ err:
* attributes are handled directly.
*/
struct xattr_handler *btrfs_xattr_handlers[] = {
-#ifdef CONFIG_FS_POSIX_ACL
+#ifdef CONFIG_BTRFS_POSIX_ACL
&btrfs_xattr_acl_access_handler,
&btrfs_xattr_acl_default_handler,
#endif
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index ecfbce836d32..3e2b90eaa239 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -208,7 +208,7 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
*total_in = 0;
workspace = find_zlib_workspace();
- if (!workspace)
+ if (IS_ERR(workspace))
return -1;
if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
@@ -366,7 +366,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
char *kaddr;
workspace = find_zlib_workspace();
- if (!workspace)
+ if (IS_ERR(workspace))
return -ENOMEM;
data_in = kmap(pages_in[page_in_index]);
@@ -547,7 +547,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
return -ENOMEM;
workspace = find_zlib_workspace();
- if (!workspace)
+ if (IS_ERR(workspace))
return -ENOMEM;
workspace->inf_strm.next_in = data_in;