diff options
Diffstat (limited to 'fs/btrfs')
-rw-r--r-- | fs/btrfs/Makefile | 19 | ||||
-rw-r--r-- | fs/btrfs/acl.c | 20 | ||||
-rw-r--r-- | fs/btrfs/async-thread.c | 67 | ||||
-rw-r--r-- | fs/btrfs/async-thread.h | 2 | ||||
-rw-r--r-- | fs/btrfs/ctree.c | 327 | ||||
-rw-r--r-- | fs/btrfs/ctree.h | 90 | ||||
-rw-r--r-- | fs/btrfs/delayed-ref.c | 1 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 110 | ||||
-rw-r--r-- | fs/btrfs/extent-tree.c | 447 | ||||
-rw-r--r-- | fs/btrfs/extent_io.c | 183 | ||||
-rw-r--r-- | fs/btrfs/extent_map.c | 18 | ||||
-rw-r--r-- | fs/btrfs/file.c | 95 | ||||
-rw-r--r-- | fs/btrfs/free-space-cache.c | 535 | ||||
-rw-r--r-- | fs/btrfs/free-space-cache.h | 44 | ||||
-rw-r--r-- | fs/btrfs/inode-map.c | 2 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 188 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 60 | ||||
-rw-r--r-- | fs/btrfs/locking.c | 4 | ||||
-rw-r--r-- | fs/btrfs/ordered-data.c | 2 | ||||
-rw-r--r-- | fs/btrfs/super.c | 86 | ||||
-rw-r--r-- | fs/btrfs/transaction.c | 13 | ||||
-rw-r--r-- | fs/btrfs/tree-log.c | 14 | ||||
-rw-r--r-- | fs/btrfs/volumes.c | 200 | ||||
-rw-r--r-- | fs/btrfs/volumes.h | 18 |
24 files changed, 1543 insertions, 1002 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 9adf5e4f7e96..94212844a9bc 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -1,25 +1,10 @@ -ifneq ($(KERNELRELEASE),) -# kbuild part of makefile obj-$(CONFIG_BTRFS_FS) := btrfs.o -btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + +btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ file-item.o inode-item.o inode-map.o disk-io.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ compression.o delayed-ref.o -else - -# Normal Makefile - -KERNELDIR := /lib/modules/`uname -r`/build -all: - $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules - -modules_install: - $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install -clean: - $(MAKE) -C $(KERNELDIR) M=`pwd` clean - -endif diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 1d53b62dbba5..cbba000dccbe 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -60,15 +60,20 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) return ERR_PTR(-EINVAL); } + /* Handle the cached NULL acl case without locking */ + acl = ACCESS_ONCE(*p_acl); + if (!acl) + return acl; + spin_lock(&inode->i_lock); - if (*p_acl != BTRFS_ACL_NOT_CACHED) - acl = posix_acl_dup(*p_acl); + acl = *p_acl; + if (acl != BTRFS_ACL_NOT_CACHED) + acl = posix_acl_dup(acl); spin_unlock(&inode->i_lock); - if (acl) + if (acl != BTRFS_ACL_NOT_CACHED) return acl; - size = __btrfs_getxattr(inode, name, "", 0); if (size > 0) { value = kzalloc(size, GFP_NOFS); @@ -80,9 +85,12 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) btrfs_update_cached_acl(inode, p_acl, acl); } kfree(value); - } else if (size == -ENOENT) { + } else if (size == -ENOENT || size == -ENODATA || size == 0) { + /* FIXME, who returns -ENOENT? I think nobody */ acl = NULL; btrfs_update_cached_acl(inode, p_acl, acl); + } else { + acl = ERR_PTR(-EIO); } return acl; @@ -256,7 +264,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (IS_POSIXACL(dir) && acl) { diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index c84ca1f5259a..502c3d61de62 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -20,12 +20,12 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/freezer.h> -#include <linux/ftrace.h> #include "async-thread.h" #define WORK_QUEUED_BIT 0 #define WORK_DONE_BIT 1 #define WORK_ORDER_DONE_BIT 2 +#define WORK_HIGH_PRIO_BIT 3 /* * container for the kthread task pointer and the list of pending work @@ -37,6 +37,7 @@ struct btrfs_worker_thread { /* list of struct btrfs_work that are waiting for service */ struct list_head pending; + struct list_head prio_pending; /* list of worker threads from struct btrfs_workers */ struct list_head worker_list; @@ -104,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, spin_lock_irqsave(&workers->lock, flags); - while (!list_empty(&workers->order_list)) { - work = list_entry(workers->order_list.next, - struct btrfs_work, order_list); - + while (1) { + if (!list_empty(&workers->prio_order_list)) { + work = list_entry(workers->prio_order_list.next, + struct btrfs_work, order_list); + } else if (!list_empty(&workers->order_list)) { + work = list_entry(workers->order_list.next, + struct btrfs_work, order_list); + } else { + break; + } if (!test_bit(WORK_DONE_BIT, &work->flags)) break; @@ -144,8 +151,14 @@ static int worker_loop(void *arg) do { spin_lock_irq(&worker->lock); again_locked: - while (!list_empty(&worker->pending)) { - cur = worker->pending.next; + while (1) { + if (!list_empty(&worker->prio_pending)) + cur = worker->prio_pending.next; + else if (!list_empty(&worker->pending)) + cur = worker->pending.next; + else + break; + work = list_entry(cur, struct btrfs_work, list); list_del(&work->list); clear_bit(WORK_QUEUED_BIT, &work->flags); @@ -164,7 +177,6 @@ again_locked: spin_lock_irq(&worker->lock); check_idle_worker(worker); - } if (freezing(current)) { worker->working = 0; @@ -179,7 +191,8 @@ again_locked: * jump_in? */ smp_mb(); - if (!list_empty(&worker->pending)) + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) continue; /* @@ -192,13 +205,18 @@ again_locked: */ schedule_timeout(1); smp_mb(); - if (!list_empty(&worker->pending)) + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) continue; + if (kthread_should_stop()) + break; + /* still no more work?, sleep for real */ spin_lock_irq(&worker->lock); set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&worker->pending)) + if (!list_empty(&worker->pending) || + !list_empty(&worker->prio_pending)) goto again_locked; /* @@ -208,7 +226,8 @@ again_locked: worker->working = 0; spin_unlock_irq(&worker->lock); - schedule(); + if (!kthread_should_stop()) + schedule(); } __set_current_state(TASK_RUNNING); } @@ -245,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) INIT_LIST_HEAD(&workers->worker_list); INIT_LIST_HEAD(&workers->idle_list); INIT_LIST_HEAD(&workers->order_list); + INIT_LIST_HEAD(&workers->prio_order_list); spin_lock_init(&workers->lock); workers->max_workers = max; workers->idle_thresh = 32; @@ -270,6 +290,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) } INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->prio_pending); INIT_LIST_HEAD(&worker->worker_list); spin_lock_init(&worker->lock); atomic_set(&worker->num_pending, 0); @@ -393,7 +414,10 @@ int btrfs_requeue_work(struct btrfs_work *work) goto out; spin_lock_irqsave(&worker->lock, flags); - list_add_tail(&work->list, &worker->pending); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); atomic_inc(&worker->num_pending); /* by definition we're busy, take ourselves off the idle @@ -419,6 +443,11 @@ out: return 0; } +void btrfs_set_work_high_prio(struct btrfs_work *work) +{ + set_bit(WORK_HIGH_PRIO_BIT, &work->flags); +} + /* * places a struct btrfs_work into the pending queue of one of the kthreads */ @@ -435,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) worker = find_worker(workers); if (workers->ordered) { spin_lock_irqsave(&workers->lock, flags); - list_add_tail(&work->order_list, &workers->order_list); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) { + list_add_tail(&work->order_list, + &workers->prio_order_list); + } else { + list_add_tail(&work->order_list, &workers->order_list); + } spin_unlock_irqrestore(&workers->lock, flags); } else { INIT_LIST_HEAD(&work->order_list); @@ -443,7 +477,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) spin_lock_irqsave(&worker->lock, flags); - list_add_tail(&work->list, &worker->pending); + if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) + list_add_tail(&work->list, &worker->prio_pending); + else + list_add_tail(&work->list, &worker->pending); atomic_inc(&worker->num_pending); check_busy_worker(worker); diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 31be4ed8b63e..1b511c109db6 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -85,6 +85,7 @@ struct btrfs_workers { * of work items waiting for completion */ struct list_head order_list; + struct list_head prio_order_list; /* lock for finding the next worker thread to queue on */ spinlock_t lock; @@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); int btrfs_stop_workers(struct btrfs_workers *workers); void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); int btrfs_requeue_work(struct btrfs_work *work); +void btrfs_set_work_high_prio(struct btrfs_work *work); #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dbb724124633..a99f1c2a710d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1244,9 +1244,9 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, * readahead one full node of leaves, finding things that are close * to the block in 'slot', and triggering ra on them. */ -static noinline void reada_for_search(struct btrfs_root *root, - struct btrfs_path *path, - int level, int slot, u64 objectid) +static void reada_for_search(struct btrfs_root *root, + struct btrfs_path *path, + int level, int slot, u64 objectid) { struct extent_buffer *node; struct btrfs_disk_key disk_key; @@ -1325,12 +1325,12 @@ static noinline int reada_for_balance(struct btrfs_root *root, int ret = 0; int blocksize; - parent = path->nodes[level - 1]; + parent = path->nodes[level + 1]; if (!parent) return 0; nritems = btrfs_header_nritems(parent); - slot = path->slots[level]; + slot = path->slots[level + 1]; blocksize = btrfs_level_size(root, level); if (slot > 0) { @@ -1341,7 +1341,7 @@ static noinline int reada_for_balance(struct btrfs_root *root, block1 = 0; free_extent_buffer(eb); } - if (slot < nritems) { + if (slot + 1 < nritems) { block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); eb = btrfs_find_tree_block(root, block2, blocksize); @@ -1351,7 +1351,11 @@ static noinline int reada_for_balance(struct btrfs_root *root, } if (block1 || block2) { ret = -EAGAIN; + + /* release the whole path */ btrfs_release_path(root, path); + + /* read the blocks */ if (block1) readahead_tree_block(root, block1, blocksize, 0); if (block2) @@ -1361,7 +1365,7 @@ static noinline int reada_for_balance(struct btrfs_root *root, eb = read_tree_block(root, block1, blocksize, 0); free_extent_buffer(eb); } - if (block1) { + if (block2) { eb = read_tree_block(root, block2, blocksize, 0); free_extent_buffer(eb); } @@ -1447,6 +1451,120 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level) } /* + * helper function for btrfs_search_slot. The goal is to find a block + * in cache without setting the path to blocking. If we find the block + * we return zero and the path is unchanged. + * + * If we can't find the block, we set the path blocking and do some + * reada. -EAGAIN is returned and the search must be repeated. + */ +static int +read_block_for_search(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer **eb_ret, int level, int slot, + struct btrfs_key *key) +{ + u64 blocknr; + u64 gen; + u32 blocksize; + struct extent_buffer *b = *eb_ret; + struct extent_buffer *tmp; + + blocknr = btrfs_node_blockptr(b, slot); + gen = btrfs_node_ptr_generation(b, slot); + blocksize = btrfs_level_size(root, level - 1); + + tmp = btrfs_find_tree_block(root, blocknr, blocksize); + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + *eb_ret = tmp; + return 0; + } + + /* + * reduce lock contention at high levels + * of the btree by dropping locks before + * we read. + */ + btrfs_unlock_up_safe(p, level + 1); + btrfs_set_path_blocking(p); + + if (tmp) + free_extent_buffer(tmp); + if (p->reada) + reada_for_search(root, p, level, slot, key->objectid); + + btrfs_release_path(NULL, p); + tmp = read_tree_block(root, blocknr, blocksize, gen); + if (tmp) + free_extent_buffer(tmp); + return -EAGAIN; +} + +/* + * helper function for btrfs_search_slot. This does all of the checks + * for node-level blocks and does any balancing required based on + * the ins_len. + * + * If no extra work was required, zero is returned. If we had to + * drop the path, -EAGAIN is returned and btrfs_search_slot must + * start over + */ +static int +setup_nodes_for_search(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p, + struct extent_buffer *b, int level, int ins_len) +{ + int ret; + if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = split_node(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + + BUG_ON(sret > 0); + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + } else if (ins_len < 0 && btrfs_header_nritems(b) < + BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { + int sret; + + sret = reada_for_balance(root, p, level); + if (sret) + goto again; + + btrfs_set_path_blocking(p); + sret = balance_level(trans, root, p, level); + btrfs_clear_path_blocking(p, NULL); + + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + if (!b) { + btrfs_release_path(NULL, p); + goto again; + } + BUG_ON(btrfs_header_nritems(b) == 1); + } + return 0; + +again: + ret = -EAGAIN; +done: + return ret; +} + +/* * look for key in the tree. path is filled in with nodes along the way * if key is found, we return zero and you can find the item in the leaf * level of the path (level 0) @@ -1464,16 +1582,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root ins_len, int cow) { struct extent_buffer *b; - struct extent_buffer *tmp; int slot; int ret; int level; - int should_reada = p->reada; int lowest_unlock = 1; - int blocksize; u8 lowest_level = 0; - u64 blocknr; - u64 gen; lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); @@ -1502,7 +1615,11 @@ again: if (cow) { int wret; - /* is a cow on this block not required */ + /* + * if we don't really need to cow this block + * then we don't want to set the path blocking, + * so we test it here + */ if (btrfs_header_generation(b) == trans->transid && btrfs_header_owner(b) == root->root_key.objectid && !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { @@ -1557,51 +1674,15 @@ cow_done: if (ret && slot > 0) slot -= 1; p->slots[level] = slot; - if ((p->search_for_split || ins_len > 0) && - btrfs_header_nritems(b) >= - BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { - int sret; - - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - - btrfs_set_path_blocking(p); - sret = split_node(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL); - - BUG_ON(sret > 0); - if (sret) { - ret = sret; - goto done; - } - b = p->nodes[level]; - slot = p->slots[level]; - } else if (ins_len < 0 && - btrfs_header_nritems(b) < - BTRFS_NODEPTRS_PER_BLOCK(root) / 4) { - int sret; - - sret = reada_for_balance(root, p, level); - if (sret) - goto again; - - btrfs_set_path_blocking(p); - sret = balance_level(trans, root, p, level); - btrfs_clear_path_blocking(p, NULL); + ret = setup_nodes_for_search(trans, root, p, b, level, + ins_len); + if (ret == -EAGAIN) + goto again; + else if (ret) + goto done; + b = p->nodes[level]; + slot = p->slots[level]; - if (sret) { - ret = sret; - goto done; - } - b = p->nodes[level]; - if (!b) { - btrfs_release_path(NULL, p); - goto again; - } - slot = p->slots[level]; - BUG_ON(btrfs_header_nritems(b) == 1); - } unlock_up(p, level, lowest_unlock); /* this is only true while dropping a snapshot */ @@ -1610,44 +1691,11 @@ cow_done: goto done; } - blocknr = btrfs_node_blockptr(b, slot); - gen = btrfs_node_ptr_generation(b, slot); - blocksize = btrfs_level_size(root, level - 1); + ret = read_block_for_search(trans, root, p, + &b, level, slot, key); + if (ret == -EAGAIN) + goto again; - tmp = btrfs_find_tree_block(root, blocknr, blocksize); - if (tmp && btrfs_buffer_uptodate(tmp, gen)) { - b = tmp; - } else { - /* - * reduce lock contention at high levels - * of the btree by dropping locks before - * we read. - */ - if (level > 0) { - btrfs_release_path(NULL, p); - if (tmp) - free_extent_buffer(tmp); - if (should_reada) - reada_for_search(root, p, - level, slot, - key->objectid); - - tmp = read_tree_block(root, blocknr, - blocksize, gen); - if (tmp) - free_extent_buffer(tmp); - goto again; - } else { - btrfs_set_path_blocking(p); - if (tmp) - free_extent_buffer(tmp); - if (should_reada) - reada_for_search(root, p, - level, slot, - key->objectid); - b = read_node_slot(root, b, slot); - } - } if (!p->skip_locking) { int lret; @@ -2116,8 +2164,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root BUG_ON(!path->nodes[level]); lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); - if (slot > nritems) - BUG(); + BUG_ON(slot > nritems); if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) BUG(); if (slot != nritems) { @@ -4086,28 +4133,44 @@ next: int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) { int slot; - int level = 1; + int level; struct extent_buffer *c; - struct extent_buffer *next = NULL; + struct extent_buffer *next; struct btrfs_key key; u32 nritems; int ret; + int old_spinning = path->leave_spinning; + int force_blocking = 0; nritems = btrfs_header_nritems(path->nodes[0]); if (nritems == 0) return 1; - btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); + /* + * we take the blocks in an order that upsets lockdep. Using + * blocking mode is the only way around it. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + force_blocking = 1; +#endif + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); +again: + level = 1; + next = NULL; btrfs_release_path(root, path); + path->keep_locks = 1; + + if (!force_blocking) + path->leave_spinning = 1; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); path->keep_locks = 0; if (ret < 0) return ret; - btrfs_set_path_blocking(path); nritems = btrfs_header_nritems(path->nodes[0]); /* * by releasing the path above we dropped all our locks. A balance @@ -4117,19 +4180,24 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) */ if (nritems > 0 && path->slots[0] < nritems - 1) { path->slots[0]++; + ret = 0; goto done; } while (level < BTRFS_MAX_LEVEL) { - if (!path->nodes[level]) - return 1; + if (!path->nodes[level]) { + ret = 1; + goto done; + } slot = path->slots[level] + 1; c = path->nodes[level]; if (slot >= btrfs_header_nritems(c)) { level++; - if (level == BTRFS_MAX_LEVEL) - return 1; + if (level == BTRFS_MAX_LEVEL) { + ret = 1; + goto done; + } continue; } @@ -4138,16 +4206,22 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) free_extent_buffer(next); } - /* the path was set to blocking above */ - if (level == 1 && (path->locks[1] || path->skip_locking) && - path->reada) - reada_for_search(root, path, level, slot, 0); + next = c; + ret = read_block_for_search(NULL, root, path, &next, level, + slot, &key); + if (ret == -EAGAIN) + goto again; - next = read_node_slot(root, c, slot); if (!path->skip_locking) { - btrfs_assert_tree_locked(c); - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + ret = btrfs_try_spin_lock(next); + if (!ret) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(next); + if (!force_blocking) + btrfs_clear_path_blocking(path, next); + } + if (force_blocking) + btrfs_set_lock_blocking(next); } break; } @@ -4157,27 +4231,42 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) c = path->nodes[level]; if (path->locks[level]) btrfs_tree_unlock(c); + free_extent_buffer(c); path->nodes[level] = next; path->slots[level] = 0; if (!path->skip_locking) path->locks[level] = 1; + if (!level) break; - btrfs_set_path_blocking(path); - if (level == 1 && path->locks[1] && path->reada) - reada_for_search(root, path, level, slot, 0); - next = read_node_slot(root, next, 0); + ret = read_block_for_search(NULL, root, path, &next, level, + 0, &key); + if (ret == -EAGAIN) + goto again; + if (!path->skip_locking) { btrfs_assert_tree_locked(path->nodes[level]); - btrfs_tree_lock(next); - btrfs_set_lock_blocking(next); + ret = btrfs_try_spin_lock(next); + if (!ret) { + btrfs_set_path_blocking(path); + btrfs_tree_lock(next); + if (!force_blocking) + btrfs_clear_path_blocking(path, next); + } + if (force_blocking) + btrfs_set_lock_blocking(next); } } + ret = 0; done: unlock_up(path, 0, 1); - return 0; + path->leave_spinning = old_spinning; + if (!old_spinning) + btrfs_set_path_blocking(path); + + return ret; } /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9417713542a2..4414a5d9983a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -143,12 +143,15 @@ static int btrfs_csum_sizes[] = { 4, 0 }; #define BTRFS_FT_MAX 9 /* - * the key defines the order in the tree, and so it also defines (optimal) - * block layout. objectid corresonds to the inode number. The flags - * tells us things about the object, and is a kind of stream selector. - * so for a given inode, keys with flags of 1 might refer to the inode - * data, flags of 2 may point to file data in the btree and flags == 3 - * may point to extents. + * The key defines the order in the tree, and so it also defines (optimal) + * block layout. + * + * objectid corresponds to the inode number. + * + * type tells us things about the object, and is a kind of stream selector. + * so for a given inode, keys with type of 1 might refer to the inode data, + * type of 2 may point to file data in the btree and type == 3 may point to + * extents. * * offset is the starting byte offset for this key in the stream. * @@ -200,7 +203,7 @@ struct btrfs_dev_item { /* * starting byte of this partition on the device, - * to allowr for stripe alignment in the future + * to allow for stripe alignment in the future */ __le64 start_offset; @@ -633,18 +636,35 @@ struct btrfs_space_info { struct rw_semaphore groups_sem; }; -struct btrfs_free_space { - struct rb_node bytes_index; - struct rb_node offset_index; - u64 offset; - u64 bytes; +/* + * free clusters are used to claim free space in relatively large chunks, + * allowing us to do less seeky writes. They are used for all metadata + * allocations and data allocations in ssd mode. + */ +struct btrfs_free_cluster { + spinlock_t lock; + spinlock_t refill_lock; + struct rb_root root; + + /* largest extent in this cluster */ + u64 max_size; + + /* first extent starting offset */ + u64 window_start; + + struct btrfs_block_group_cache *block_group; + /* + * when a cluster is allocated from a block group, we put the + * cluster onto a list in the block group so that it can + * be freed before the block group is freed. + */ + struct list_head block_group_list; }; struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; spinlock_t lock; - struct mutex alloc_mutex; struct mutex cache_mutex; u64 pinned; u64 reserved; @@ -656,6 +676,7 @@ struct btrfs_block_group_cache { struct btrfs_space_info *space_info; /* free space cache stuff */ + spinlock_t tree_lock; struct rb_root free_space_bytes; struct rb_root free_space_offset; @@ -667,6 +688,11 @@ struct btrfs_block_group_cache { /* usage count */ atomic_t count; + + /* List of struct btrfs_free_clusters for this block group. + * Today it will only have one thing on it, but that may change + */ + struct list_head cluster_list; }; struct btrfs_leaf_ref_tree { @@ -728,7 +754,6 @@ struct btrfs_fs_info { struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; - struct mutex pinned_mutex; struct mutex chunk_mutex; struct mutex drop_mutex; struct mutex volume_mutex; @@ -839,8 +864,12 @@ struct btrfs_fs_info { spinlock_t delalloc_lock; spinlock_t new_trans_lock; u64 delalloc_bytes; - u64 last_alloc; - u64 last_data_alloc; + + /* data_alloc_cluster is only used in ssd mode */ + struct btrfs_free_cluster data_alloc_cluster; + + /* all metadata allocations go through this cluster */ + struct btrfs_free_cluster meta_alloc_cluster; spinlock_t ref_cache_lock; u64 total_ref_cache_size; @@ -852,6 +881,9 @@ struct btrfs_fs_info { u64 metadata_alloc_profile; u64 system_alloc_profile; + unsigned data_chunk_allocations; + unsigned metadata_ratio; + void *bdev_holder; }; @@ -932,7 +964,6 @@ struct btrfs_root { }; /* - * inode items have the data typically returned from stat and store other * info about object characteristics. There is one for every file and dir in * the FS @@ -963,7 +994,7 @@ struct btrfs_root { #define BTRFS_EXTENT_CSUM_KEY 128 /* - * root items point to tree roots. There are typically in the root + * root items point to tree roots. They are typically in the root * tree used by the super block to find all the other trees */ #define BTRFS_ROOT_ITEM_KEY 132 @@ -1010,6 +1041,8 @@ struct btrfs_root { #define BTRFS_MOUNT_SSD (1 << 3) #define BTRFS_MOUNT_DEGRADED (1 << 4) #define BTRFS_MOUNT_COMPRESS (1 << 5) +#define BTRFS_MOUNT_NOTREELOG (1 << 6) +#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1748,6 +1781,7 @@ static inline struct dentry *fdentry(struct file *file) } /* extent-tree.c */ +void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root, unsigned long count); int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); @@ -2143,7 +2177,8 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode); extern struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 inline_limit, u64 *hint_block); + u64 start, u64 end, u64 locked_end, + u64 inline_limit, u64 *hint_block); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end); @@ -2174,21 +2209,4 @@ int btrfs_check_acl(struct inode *inode, int mask); int btrfs_init_acl(struct inode *inode, struct inode *dir); int btrfs_acl_chmod(struct inode *inode); -/* free-space-cache.c */ -int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 bytenr, u64 size); -int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes); -int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, - u64 bytenr, u64 size); -int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes); -void btrfs_remove_free_space_cache(struct btrfs_block_group_cache - *block_group); -struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes); -void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, - u64 bytes); -u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); #endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index cbf7dc8ae3ec..d6c01c096a40 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -18,7 +18,6 @@ #include <linux/sched.h> #include <linux/sort.h> -#include <linux/ftrace.h> #include "ctree.h" #include "delayed-ref.h" #include "transaction.h" diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 92d73929d381..0ff16d3331da 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -38,6 +38,7 @@ #include "locking.h" #include "ref-cache.h" #include "tree-log.h" +#include "free-space-cache.h" static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); @@ -231,10 +232,14 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, memcpy(&found, result, csum_size); read_extent_buffer(buf, &val, 0, csum_size); - printk(KERN_INFO "btrfs: %s checksum verify failed " - "on %llu wanted %X found %X level %d\n", - root->fs_info->sb->s_id, - buf->start, val, found, btrfs_header_level(buf)); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs: %s checksum verify " + "failed on %llu wanted %X found %X " + "level %d\n", + root->fs_info->sb->s_id, + (unsigned long long)buf->start, val, found, + btrfs_header_level(buf)); + } if (result != (char *)&inline_result) kfree(result); return 1; @@ -267,10 +272,13 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 0; goto out; } - printk("parent transid verify failed on %llu wanted %llu found %llu\n", - (unsigned long long)eb->start, - (unsigned long long)parent_transid, - (unsigned long long)btrfs_header_generation(eb)); + if (printk_ratelimit()) { + printk("parent transid verify failed on %llu wanted %llu " + "found %llu\n", + (unsigned long long)eb->start, + (unsigned long long)parent_transid, + (unsigned long long)btrfs_header_generation(eb)); + } ret = 1; clear_extent_buffer_uptodate(io_tree, eb); out: @@ -414,9 +422,12 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, found_start = btrfs_header_bytenr(eb); if (found_start != start) { - printk(KERN_INFO "btrfs bad tree block start %llu %llu\n", - (unsigned long long)found_start, - (unsigned long long)eb->start); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs bad tree block start " + "%llu %llu\n", + (unsigned long long)found_start, + (unsigned long long)eb->start); + } ret = -EIO; goto err; } @@ -428,8 +439,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, goto err; } if (check_tree_block_fsid(root, eb)) { - printk(KERN_INFO "btrfs bad fsid on block %llu\n", - (unsigned long long)eb->start); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs bad fsid on block %llu\n", + (unsigned long long)eb->start); + } ret = -EIO; goto err; } @@ -578,19 +591,12 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->bio_flags = bio_flags; atomic_inc(&fs_info->nr_async_submits); + + if (rw & (1 << BIO_RW_SYNCIO)) + btrfs_set_work_high_prio(&async->work); + btrfs_queue_worker(&fs_info->workers, &async->work); -#if 0 - int limit = btrfs_async_submit_limit(fs_info); - if (atomic_read(&fs_info->nr_async_submits) > limit) { - wait_event_timeout(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_submits) < limit), - HZ/10); - wait_event_timeout(fs_info->async_submit_wait, - (atomic_read(&fs_info->nr_async_bios) < limit), - HZ/10); - } -#endif while (atomic_read(&fs_info->async_submit_draining) && atomic_read(&fs_info->nr_async_submits)) { wait_event(fs_info->async_submit_wait, @@ -655,6 +661,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 0); } + /* * kthread helpers are used to submit writes so that checksumming * can happen in parallel across all CPUs @@ -764,27 +771,6 @@ static void btree_invalidatepage(struct page *page, unsigned long offset) } } -#if 0 -static int btree_writepage(struct page *page, struct writeback_control *wbc) -{ - struct buffer_head *bh; - struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; - struct buffer_head *head; - if (!page_has_buffers(page)) { - create_empty_buffers(page, root->fs_info->sb->s_blocksize, - (1 << BH_Dirty)|(1 << BH_Uptodate)); - } - head = page_buffers(page); - bh = head; - do { - if (buffer_dirty(bh)) - csum_tree_block(root, bh, 0); - bh = bh->b_this_page; - } while (bh != head); - return block_write_full_page(page, btree_get_block, wbc); -} -#endif - static struct address_space_operations btree_aops = { .readpage = btree_readpage, .writepage = btree_writepage, @@ -1272,11 +1258,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) int ret = 0; struct btrfs_device *device; struct backing_dev_info *bdi; -#if 0 - if ((bdi_bits & (1 << BDI_write_congested)) && - btrfs_congested_async(info, 0)) - return 1; -#endif + list_for_each_entry(device, &info->fs_devices->devices, dev_list) { if (!device->bdev) continue; @@ -1412,8 +1394,6 @@ static int bio_ready_for_csum(struct bio *bio) ret = extent_range_uptodate(io_tree, start + length, start + buf_len - 1); - if (ret == 1) - return ret; return ret; } @@ -1600,6 +1580,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->btree_inode = new_inode(sb); fs_info->btree_inode->i_ino = 1; fs_info->btree_inode->i_nlink = 1; + fs_info->metadata_ratio = 8; fs_info->thread_pool_size = min_t(unsigned long, num_online_cpus() + 2, 8); @@ -1647,12 +1628,15 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); mutex_init(&fs_info->drop_mutex); - mutex_init(&fs_info->pinned_mutex); mutex_init(&fs_info->chunk_mutex); mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); mutex_init(&fs_info->tree_reloc_mutex); + + btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); + btrfs_init_free_cluster(&fs_info->data_alloc_cluster); + init_waitqueue_head(&fs_info->transaction_throttle); init_waitqueue_head(&fs_info->transaction_wait); init_waitqueue_head(&fs_info->async_submit_wait); @@ -1687,7 +1671,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (features) { printk(KERN_ERR "BTRFS: couldn't mount because of " "unsupported optional features (%Lx).\n", - features); + (unsigned long long)features); err = -EINVAL; goto fail_iput; } @@ -1697,7 +1681,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!(sb->s_flags & MS_RDONLY) && features) { printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " "unsupported option features (%Lx).\n", - features); + (unsigned long long)features); err = -EINVAL; goto fail_iput; } @@ -2093,10 +2077,10 @@ static int write_dev_supers(struct btrfs_device *device, device->barriers = 0; get_bh(bh); lock_buffer(bh); - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); } } else { - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); } if (!ret && wait) { @@ -2289,7 +2273,7 @@ int close_ctree(struct btrfs_root *root) if (fs_info->delalloc_bytes) { printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", - fs_info->delalloc_bytes); + (unsigned long long)fs_info->delalloc_bytes); } if (fs_info->total_ref_cache_size) { printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", @@ -2326,16 +2310,6 @@ int close_ctree(struct btrfs_root *root) btrfs_stop_workers(&fs_info->endio_write_workers); btrfs_stop_workers(&fs_info->submit_workers); -#if 0 - while (!list_empty(&fs_info->hashers)) { - struct btrfs_hasher *hasher; - hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher, - hashers); - list_del(&hasher->hashers); - crypto_free_hash(&fs_info->hash_tfm); - kfree(hasher); - } -#endif btrfs_close_devices(fs_info->fs_devices); btrfs_mapping_tree_free(&fs_info->mapping_tree); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f5e7cae63d80..e4966444811b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -31,6 +31,7 @@ #include "volumes.h" #include "locking.h" #include "ref-cache.h" +#include "free-space-cache.h" #define PENDING_EXTENT_INSERT 0 #define PENDING_EXTENT_DELETE 1 @@ -166,7 +167,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, u64 extent_start, extent_end, size; int ret; - mutex_lock(&info->pinned_mutex); while (start < end) { ret = find_first_extent_bit(&info->pinned_extents, start, &extent_start, &extent_end, @@ -192,7 +192,6 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, ret = btrfs_add_free_space(block_group, start, size); BUG_ON(ret); } - mutex_unlock(&info->pinned_mutex); return 0; } @@ -291,8 +290,8 @@ next: block_group->key.objectid + block_group->key.offset); - remove_sb_from_cache(root, block_group); block_group->cached = 1; + remove_sb_from_cache(root, block_group); ret = 0; err: btrfs_free_path(path); @@ -326,7 +325,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( return cache; } -static inline void put_block_group(struct btrfs_block_group_cache *cache) +void btrfs_put_block_group(struct btrfs_block_group_cache *cache) { if (atomic_dec_and_test(&cache->count)) kfree(cache); @@ -399,12 +398,12 @@ again: div_factor(cache->key.offset, factor)) { group_start = cache->key.objectid; spin_unlock(&cache->lock); - put_block_group(cache); + btrfs_put_block_group(cache); goto found; } } spin_unlock(&cache->lock); - put_block_group(cache); + btrfs_put_block_group(cache); cond_resched(); } if (!wrapped) { @@ -1594,7 +1593,7 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) if (!block_group || block_group->ro) readonly = 1; if (block_group) - put_block_group(block_group); + btrfs_put_block_group(block_group); return readonly; } @@ -1845,10 +1844,14 @@ again: printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" ", %llu bytes_used, %llu bytes_reserved, " "%llu bytes_pinned, %llu bytes_readonly, %llu may use" - "%llu total\n", bytes, data_sinfo->bytes_delalloc, - data_sinfo->bytes_used, data_sinfo->bytes_reserved, - data_sinfo->bytes_pinned, data_sinfo->bytes_readonly, - data_sinfo->bytes_may_use, data_sinfo->total_bytes); + "%llu total\n", (unsigned long long)bytes, + (unsigned long long)data_sinfo->bytes_delalloc, + (unsigned long long)data_sinfo->bytes_used, + (unsigned long long)data_sinfo->bytes_reserved, + (unsigned long long)data_sinfo->bytes_pinned, + (unsigned long long)data_sinfo->bytes_readonly, + (unsigned long long)data_sinfo->bytes_may_use, + (unsigned long long)data_sinfo->total_bytes); return -ENOSPC; } data_sinfo->bytes_may_use += bytes; @@ -1919,15 +1922,29 @@ void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, spin_unlock(&info->lock); } +static void force_metadata_allocation(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags & BTRFS_BLOCK_GROUP_METADATA) + found->force_alloc = 1; + } + rcu_read_unlock(); +} + static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force) { struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = extent_root->fs_info; u64 thresh; int ret = 0; - mutex_lock(&extent_root->fs_info->chunk_mutex); + mutex_lock(&fs_info->chunk_mutex); flags = btrfs_reduce_alloc_profile(extent_root, flags); @@ -1959,6 +1976,18 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, } spin_unlock(&space_info->lock); + /* + * if we're doing a data chunk, go ahead and make sure that + * we keep a reasonable number of metadata chunks allocated in the + * FS as well. + */ + if (flags & BTRFS_BLOCK_GROUP_DATA) { + fs_info->data_chunk_allocations++; + if (!(fs_info->data_chunk_allocations % + fs_info->metadata_ratio)) + force_metadata_allocation(fs_info); + } + ret = btrfs_alloc_chunk(trans, extent_root, flags); if (ret) space_info->full = 1; @@ -2018,7 +2047,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, WARN_ON(ret); } } - put_block_group(cache); + btrfs_put_block_group(cache); total -= num_bytes; bytenr += num_bytes; } @@ -2035,7 +2064,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) return 0; bytenr = cache->key.objectid; - put_block_group(cache); + btrfs_put_block_group(cache); return bytenr; } @@ -2047,7 +2076,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, struct btrfs_block_group_cache *cache; struct btrfs_fs_info *fs_info = root->fs_info; - WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex)); if (pin) { set_extent_dirty(&fs_info->pinned_extents, bytenr, bytenr + num - 1, GFP_NOFS); @@ -2055,7 +2083,6 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, clear_extent_dirty(&fs_info->pinned_extents, bytenr, bytenr + num - 1, GFP_NOFS); } - mutex_unlock(&root->fs_info->pinned_mutex); while (num > 0) { cache = btrfs_lookup_block_group(fs_info, bytenr); @@ -2081,7 +2108,7 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, if (cache->cached) btrfs_add_free_space(cache, bytenr, len); } - put_block_group(cache); + btrfs_put_block_group(cache); bytenr += len; num -= len; } @@ -2112,7 +2139,7 @@ static int update_reserved_extents(struct btrfs_root *root, } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - put_block_group(cache); + btrfs_put_block_group(cache); bytenr += len; num -= len; } @@ -2127,7 +2154,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; int ret; - mutex_lock(&root->fs_info->pinned_mutex); while (1) { ret = find_first_extent_bit(pinned_extents, last, &start, &end, EXTENT_DIRTY); @@ -2136,7 +2162,6 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) set_extent_dirty(copy, start, end, GFP_NOFS); last = end + 1; } - mutex_unlock(&root->fs_info->pinned_mutex); return 0; } @@ -2149,7 +2174,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, int ret; while (1) { - mutex_lock(&root->fs_info->pinned_mutex); ret = find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY); if (ret) @@ -2163,7 +2187,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } - mutex_unlock(&root->fs_info->pinned_mutex); return ret; } @@ -2205,7 +2228,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans, free_extent_buffer(buf); pinit: btrfs_set_path_blocking(path); - mutex_lock(&root->fs_info->pinned_mutex); /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); @@ -2511,8 +2533,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, */ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID && owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { - mutex_lock(&root->fs_info->pinned_mutex); - /* unlocks the pinned mutex */ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); update_reserved_extents(root, bytenr, num_bytes, 0); @@ -2554,228 +2574,237 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, { int ret = 0; struct btrfs_root *root = orig_root->fs_info->extent_root; - u64 total_needed = num_bytes; - u64 *last_ptr = NULL; - u64 last_wanted = 0; + struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; - int chunk_alloc_done = 0; int empty_cluster = 2 * 1024 * 1024; int allowed_chunk_alloc = 0; - struct list_head *head = NULL, *cur = NULL; - int loop = 0; - int extra_loop = 0; struct btrfs_space_info *space_info; + int last_ptr_loop = 0; + int loop = 0; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); ins->objectid = 0; ins->offset = 0; + space_info = __find_space_info(root->fs_info, data); + if (orig_root->ref_cows || empty_size) allowed_chunk_alloc = 1; if (data & BTRFS_BLOCK_GROUP_METADATA) { - last_ptr = &root->fs_info->last_alloc; + last_ptr = &root->fs_info->meta_alloc_cluster; if (!btrfs_test_opt(root, SSD)) empty_cluster = 64 * 1024; } - if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) - last_ptr = &root->fs_info->last_data_alloc; + if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { + last_ptr = &root->fs_info->data_alloc_cluster; + } if (last_ptr) { - if (*last_ptr) { - hint_byte = *last_ptr; - last_wanted = *last_ptr; - } else - empty_size += empty_cluster; - } else { - empty_cluster = 0; + spin_lock(&last_ptr->lock); + if (last_ptr->block_group) + hint_byte = last_ptr->window_start; + spin_unlock(&last_ptr->lock); } + search_start = max(search_start, first_logical_byte(root, 0)); search_start = max(search_start, hint_byte); - if (last_wanted && search_start != last_wanted) { - last_wanted = 0; - empty_size += empty_cluster; + if (!last_ptr) { + empty_cluster = 0; + loop = 1; } - total_needed += empty_size; - block_group = btrfs_lookup_block_group(root->fs_info, search_start); - if (!block_group) - block_group = btrfs_lookup_first_block_group(root->fs_info, - search_start); - space_info = __find_space_info(root->fs_info, data); + if (search_start == hint_byte) { + block_group = btrfs_lookup_block_group(root->fs_info, + search_start); + if (block_group && block_group_bits(block_group, data)) { + down_read(&space_info->groups_sem); + goto have_block_group; + } else if (block_group) { + btrfs_put_block_group(block_group); + } + } +search: down_read(&space_info->groups_sem); - while (1) { - struct btrfs_free_space *free_space; - /* - * the only way this happens if our hint points to a block - * group thats not of the proper type, while looping this - * should never happen - */ - if (empty_size) - extra_loop = 1; + list_for_each_entry(block_group, &space_info->block_groups, list) { + u64 offset; - if (!block_group) - goto new_group_no_lock; + atomic_inc(&block_group->count); + search_start = block_group->key.objectid; +have_block_group: if (unlikely(!block_group->cached)) { mutex_lock(&block_group->cache_mutex); ret = cache_block_group(root, block_group); mutex_unlock(&block_group->cache_mutex); - if (ret) + if (ret) { + btrfs_put_block_group(block_group); break; + } } - mutex_lock(&block_group->alloc_mutex); - if (unlikely(!block_group_bits(block_group, data))) - goto new_group; - if (unlikely(block_group->ro)) - goto new_group; + goto loop; - free_space = btrfs_find_free_space(block_group, search_start, - total_needed); - if (free_space) { - u64 start = block_group->key.objectid; - u64 end = block_group->key.objectid + - block_group->key.offset; + if (last_ptr) { + /* + * the refill lock keeps out other + * people trying to start a new cluster + */ + spin_lock(&last_ptr->refill_lock); + offset = btrfs_alloc_from_cluster(block_group, last_ptr, + num_bytes, search_start); + if (offset) { + /* we have a block, we're done */ + spin_unlock(&last_ptr->refill_lock); + goto checks; + } - search_start = stripe_align(root, free_space->offset); + spin_lock(&last_ptr->lock); + /* + * whoops, this cluster doesn't actually point to + * this block group. Get a ref on the block + * group is does point to and try again + */ + if (!last_ptr_loop && last_ptr->block_group && + last_ptr->block_group != block_group) { + + btrfs_put_block_group(block_group); + block_group = last_ptr->block_group; + atomic_inc(&block_group->count); + spin_unlock(&last_ptr->lock); + spin_unlock(&last_ptr->refill_lock); + + last_ptr_loop = 1; + search_start = block_group->key.objectid; + goto have_block_group; + } + spin_unlock(&last_ptr->lock); - /* move on to the next group */ - if (search_start + num_bytes >= search_end) - goto new_group; + /* + * this cluster didn't work out, free it and + * start over + */ + btrfs_return_cluster_to_free_space(NULL, last_ptr); - /* move on to the next group */ - if (search_start + num_bytes > end) - goto new_group; + last_ptr_loop = 0; - if (last_wanted && search_start != last_wanted) { - total_needed += empty_cluster; - empty_size += empty_cluster; - last_wanted = 0; + /* allocate a cluster in this block group */ + ret = btrfs_find_space_cluster(trans, + block_group, last_ptr, + offset, num_bytes, + empty_cluster + empty_size); + if (ret == 0) { /* - * if search_start is still in this block group - * then we just re-search this block group + * now pull our allocation out of this + * cluster */ - if (search_start >= start && - search_start < end) { - mutex_unlock(&block_group->alloc_mutex); - continue; + offset = btrfs_alloc_from_cluster(block_group, + last_ptr, num_bytes, + search_start); + if (offset) { + /* we found one, proceed */ + spin_unlock(&last_ptr->refill_lock); + goto checks; } - - /* else we go to the next block group */ - goto new_group; } - - if (exclude_nr > 0 && - (search_start + num_bytes > exclude_start && - search_start < exclude_start + exclude_nr)) { - search_start = exclude_start + exclude_nr; - /* - * if search_start is still in this block group - * then we just re-search this block group - */ - if (search_start >= start && - search_start < end) { - mutex_unlock(&block_group->alloc_mutex); - last_wanted = 0; - continue; - } - - /* else we go to the next block group */ - goto new_group; + /* + * at this point we either didn't find a cluster + * or we weren't able to allocate a block from our + * cluster. Free the cluster we've been trying + * to use, and go to the next block group + */ + if (loop < 2) { + btrfs_return_cluster_to_free_space(NULL, + last_ptr); + spin_unlock(&last_ptr->refill_lock); + goto loop; } + spin_unlock(&last_ptr->refill_lock); + } - ins->objectid = search_start; - ins->offset = num_bytes; + offset = btrfs_find_space_for_alloc(block_group, search_start, + num_bytes, empty_size); + if (!offset) + goto loop; +checks: + search_start = stripe_align(root, offset); + + /* move on to the next group */ + if (search_start + num_bytes >= search_end) { + btrfs_add_free_space(block_group, offset, num_bytes); + goto loop; + } - btrfs_remove_free_space_lock(block_group, search_start, - num_bytes); - /* we are all good, lets return */ - mutex_unlock(&block_group->alloc_mutex); - break; + /* move on to the next group */ + if (search_start + num_bytes > + block_group->key.objectid + block_group->key.offset) { + btrfs_add_free_space(block_group, offset, num_bytes); + goto loop; } -new_group: - mutex_unlock(&block_group->alloc_mutex); - put_block_group(block_group); - block_group = NULL; -new_group_no_lock: - /* don't try to compare new allocations against the - * last allocation any more - */ - last_wanted = 0; - /* - * Here's how this works. - * loop == 0: we were searching a block group via a hint - * and didn't find anything, so we start at - * the head of the block groups and keep searching - * loop == 1: we're searching through all of the block groups - * if we hit the head again we have searched - * all of the block groups for this space and we - * need to try and allocate, if we cant error out. - * loop == 2: we allocated more space and are looping through - * all of the block groups again. - */ - if (loop == 0) { - head = &space_info->block_groups; - cur = head->next; - loop++; - } else if (loop == 1 && cur == head) { - int keep_going; - - /* at this point we give up on the empty_size - * allocations and just try to allocate the min - * space. - * - * The extra_loop field was set if an empty_size - * allocation was attempted above, and if this - * is try we need to try the loop again without - * the additional empty_size. + if (exclude_nr > 0 && + (search_start + num_bytes > exclude_start && + search_start < exclude_start + exclude_nr)) { + search_start = exclude_start + exclude_nr; + + btrfs_add_free_space(block_group, offset, num_bytes); + /* + * if search_start is still in this block group + * then we just re-search this block group */ - total_needed -= empty_size; - empty_size = 0; - keep_going = extra_loop; - loop++; + if (search_start >= block_group->key.objectid && + search_start < (block_group->key.objectid + + block_group->key.offset)) + goto have_block_group; + goto loop; + } - if (allowed_chunk_alloc && !chunk_alloc_done) { - up_read(&space_info->groups_sem); - ret = do_chunk_alloc(trans, root, num_bytes + - 2 * 1024 * 1024, data, 1); - down_read(&space_info->groups_sem); - if (ret < 0) - goto loop_check; - head = &space_info->block_groups; - /* - * we've allocated a new chunk, keep - * trying - */ - keep_going = 1; - chunk_alloc_done = 1; - } else if (!allowed_chunk_alloc) { - space_info->force_alloc = 1; - } -loop_check: - if (keep_going) { - cur = head->next; - extra_loop = 0; - } else { - break; - } - } else if (cur == head) { - break; + ins->objectid = search_start; + ins->offset = num_bytes; + + if (offset < search_start) + btrfs_add_free_space(block_group, offset, + search_start - offset); + BUG_ON(offset > search_start); + + /* we are all good, lets return */ + break; +loop: + btrfs_put_block_group(block_group); + } + up_read(&space_info->groups_sem); + + /* loop == 0, try to find a clustered alloc in every block group + * loop == 1, try again after forcing a chunk allocation + * loop == 2, set empty_size and empty_cluster to 0 and try again + */ + if (!ins->objectid && loop < 3 && + (empty_size || empty_cluster || allowed_chunk_alloc)) { + if (loop >= 2) { + empty_size = 0; + empty_cluster = 0; } - block_group = list_entry(cur, struct btrfs_block_group_cache, - list); - atomic_inc(&block_group->count); + if (allowed_chunk_alloc) { + ret = do_chunk_alloc(trans, root, num_bytes + + 2 * 1024 * 1024, data, 1); + allowed_chunk_alloc = 0; + } else { + space_info->force_alloc = 1; + } - search_start = block_group->key.objectid; - cur = cur->next; + if (loop < 3) { + loop++; + goto search; + } + ret = -ENOSPC; + } else if (!ins->objectid) { + ret = -ENOSPC; } /* we found what we needed */ @@ -2783,21 +2812,10 @@ loop_check: if (!(data & BTRFS_BLOCK_GROUP_DATA)) trans->block_group = block_group->key.objectid; - if (last_ptr) - *last_ptr = ins->objectid + ins->offset; + btrfs_put_block_group(block_group); ret = 0; - } else if (!ret) { - printk(KERN_ERR "btrfs searching for %llu bytes, " - "num_bytes %llu, loop %d, allowed_alloc %d\n", - (unsigned long long)total_needed, - (unsigned long long)num_bytes, - loop, allowed_chunk_alloc); - ret = -ENOSPC; } - if (block_group) - put_block_group(block_group); - up_read(&space_info->groups_sem); return ret; } @@ -2810,9 +2828,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes) info->bytes_pinned - info->bytes_reserved), (info->full) ? "" : "not "); printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," - " may_use=%llu, used=%llu\n", info->total_bytes, - info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use, - info->bytes_used); + " may_use=%llu, used=%llu\n", + (unsigned long long)info->total_bytes, + (unsigned long long)info->bytes_pinned, + (unsigned long long)info->bytes_delalloc, + (unsigned long long)info->bytes_may_use, + (unsigned long long)info->bytes_used); down_read(&info->groups_sem); list_for_each_entry(cache, &info->block_groups, list) { @@ -2902,7 +2923,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) ret = btrfs_discard_extent(root, start, len); btrfs_add_free_space(cache, start, len); - put_block_group(cache); + btrfs_put_block_group(cache); update_reserved_extents(root, start, len, 0); return ret; @@ -3040,7 +3061,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset); BUG_ON(ret); - put_block_group(block_group); + btrfs_put_block_group(block_group); ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, ref_generation, owner, ins, 1); return ret; @@ -5729,7 +5750,7 @@ next: WARN_ON(block_group->reserved > 0); WARN_ON(btrfs_block_group_used(&block_group->item) > 0); spin_unlock(&block_group->lock); - put_block_group(block_group); + btrfs_put_block_group(block_group); ret = 0; out: btrfs_free_path(path); @@ -5856,9 +5877,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); - mutex_init(&cache->alloc_mutex); + spin_lock_init(&cache->tree_lock); mutex_init(&cache->cache_mutex); INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); @@ -5912,9 +5934,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); - mutex_init(&cache->alloc_mutex); + spin_lock_init(&cache->tree_lock); mutex_init(&cache->cache_mutex); INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); btrfs_set_block_group_used(&cache->item, bytes_used); btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); @@ -5974,8 +5997,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&block_group->space_info->lock); block_group->space_info->full = 0; - put_block_group(block_group); - put_block_group(block_group); + btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 08085af089e2..fe9eb990e443 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -17,12 +17,6 @@ #include "ctree.h" #include "btrfs_inode.h" -/* temporary define until extent_map moves out of btrfs */ -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *, struct kmem_cache *, - unsigned long)); - static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -50,20 +44,23 @@ struct extent_page_data { /* tells writepage not to lock the state bits for this range * it still does the unlocking */ - int extent_locked; + unsigned int extent_locked:1; + + /* tells the submit_bio code to use a WRITE_SYNC */ + unsigned int sync_io:1; }; int __init extent_io_init(void) { - extent_state_cache = btrfs_cache_create("extent_state", - sizeof(struct extent_state), 0, - NULL); + extent_state_cache = kmem_cache_create("extent_state", + sizeof(struct extent_state), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_state_cache) return -ENOMEM; - extent_buffer_cache = btrfs_cache_create("extent_buffers", - sizeof(struct extent_buffer), 0, - NULL); + extent_buffer_cache = kmem_cache_create("extent_buffers", + sizeof(struct extent_buffer), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_buffer_cache) goto free_state_cache; return 0; @@ -1404,69 +1401,6 @@ out: return total_bytes; } -#if 0 -/* - * helper function to lock both pages and extents in the tree. - * pages must be locked first. - */ -static int lock_range(struct extent_io_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - int err; - - while (index <= end_index) { - page = grab_cache_page(tree->mapping, index); - if (!page) { - err = -ENOMEM; - goto failed; - } - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto failed; - } - index++; - } - lock_extent(tree, start, end, GFP_NOFS); - return 0; - -failed: - /* - * we failed above in getting the page at 'index', so we undo here - * up to but not including the page at 'index' - */ - end_index = index; - index = start >> PAGE_CACHE_SHIFT; - while (index < end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - return err; -} - -/* - * helper function to unlock both pages and extents in the tree. - */ -static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end) -{ - unsigned long index = start >> PAGE_CACHE_SHIFT; - unsigned long end_index = end >> PAGE_CACHE_SHIFT; - struct page *page; - - while (index <= end_index) { - page = find_get_page(tree->mapping, index); - unlock_page(page); - page_cache_release(page); - index++; - } - unlock_extent(tree, start, end, GFP_NOFS); - return 0; -} -#endif - /* * set the private field for a given byte offset in the tree. If there isn't * an extent_state there already, this does nothing. @@ -2101,6 +2035,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, return ret; } +static noinline void update_nr_written(struct page *page, + struct writeback_control *wbc, + unsigned long nr_written) +{ + wbc->nr_to_write -= nr_written; + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && + wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) + page->mapping->writeback_index = page->index + nr_written; +} + /* * the writepage semantics are similar to regular writepage. extent * records are inserted to lock ranges in the tree, and as dirty areas @@ -2136,8 +2080,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, u64 delalloc_end; int page_started; int compressed; + int write_flags; unsigned long nr_written = 0; + if (wbc->sync_mode == WB_SYNC_ALL) + write_flags = WRITE_SYNC_PLUG; + else + write_flags = WRITE; + WARN_ON(!PageLocked(page)); pg_offset = i_size & (PAGE_CACHE_SIZE - 1); if (page->index > end_index || @@ -2164,6 +2114,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_end = 0; page_started = 0; if (!epd->extent_locked) { + /* + * make sure the wbc mapping index is at least updated + * to this page. + */ + update_nr_written(page, wbc, 0); + while (delalloc_end < page_end) { nr_delalloc = find_lock_delalloc_range(inode, tree, page, @@ -2185,7 +2141,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, */ if (page_started) { ret = 0; - goto update_nr_written; + /* + * we've unlocked the page, so we can't update + * the mapping's writeback index, just update + * nr_to_write. + */ + wbc->nr_to_write -= nr_written; + goto done_unlocked; } } lock_extent(tree, start, page_end, GFP_NOFS); @@ -2198,13 +2160,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, if (ret == -EAGAIN) { unlock_extent(tree, start, page_end, GFP_NOFS); redirty_page_for_writepage(wbc, page); + update_nr_written(page, wbc, nr_written); unlock_page(page); ret = 0; - goto update_nr_written; + goto done_unlocked; } } - nr_written++; + /* + * we don't want to touch the inode after unlocking the page, + * so we update the mapping writeback index now + */ + update_nr_written(page, wbc, nr_written + 1); end = page_end; if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) @@ -2314,9 +2281,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, (unsigned long long)end); } - ret = submit_extent_page(WRITE, tree, page, sector, - iosize, pg_offset, bdev, - &epd->bio, max_nr, + ret = submit_extent_page(write_flags, tree, page, + sector, iosize, pg_offset, + bdev, &epd->bio, max_nr, end_bio_extent_writepage, 0, 0, 0); if (ret) @@ -2336,11 +2303,8 @@ done: unlock_extent(tree, unlock_start, page_end, GFP_NOFS); unlock_page(page); -update_nr_written: - wbc->nr_to_write -= nr_written; - if (wbc->range_cyclic || (wbc->nr_to_write > 0 && - wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) - page->mapping->writeback_index = page->index + nr_written; +done_unlocked: + return 0; } @@ -2460,15 +2424,23 @@ retry: return ret; } -static noinline void flush_write_bio(void *data) +static void flush_epd_write_bio(struct extent_page_data *epd) { - struct extent_page_data *epd = data; if (epd->bio) { - submit_one_bio(WRITE, epd->bio, 0, 0); + if (epd->sync_io) + submit_one_bio(WRITE_SYNC, epd->bio, 0, 0); + else + submit_one_bio(WRITE, epd->bio, 0, 0); epd->bio = NULL; } } +static noinline void flush_write_bio(void *data) +{ + struct extent_page_data *epd = data; + flush_epd_write_bio(epd); +} + int extent_write_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, struct writeback_control *wbc) @@ -2480,23 +2452,22 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, .tree = tree, .get_extent = get_extent, .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; struct writeback_control wbc_writepages = { .bdi = wbc->bdi, - .sync_mode = WB_SYNC_NONE, + .sync_mode = wbc->sync_mode, .older_than_this = NULL, .nr_to_write = 64, .range_start = page_offset(page) + PAGE_CACHE_SIZE, .range_end = (loff_t)-1, }; - ret = __extent_writepage(page, wbc, &epd); extent_write_cache_pages(tree, mapping, &wbc_writepages, __extent_writepage, &epd, flush_write_bio); - if (epd.bio) - submit_one_bio(WRITE, epd.bio, 0, 0); + flush_epd_write_bio(&epd); return ret; } @@ -2515,6 +2486,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, .tree = tree, .get_extent = get_extent, .extent_locked = 1, + .sync_io = mode == WB_SYNC_ALL, }; struct writeback_control wbc_writepages = { .bdi = inode->i_mapping->backing_dev_info, @@ -2540,8 +2512,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, start += PAGE_CACHE_SIZE; } - if (epd.bio) - submit_one_bio(WRITE, epd.bio, 0, 0); + flush_epd_write_bio(&epd); return ret; } @@ -2556,13 +2527,13 @@ int extent_writepages(struct extent_io_tree *tree, .tree = tree, .get_extent = get_extent, .extent_locked = 0, + .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; ret = extent_write_cache_pages(tree, mapping, wbc, __extent_writepage, &epd, flush_write_bio); - if (epd.bio) - submit_one_bio(WRITE, epd.bio, 0, 0); + flush_epd_write_bio(&epd); return ret; } @@ -2884,25 +2855,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, disko = 0; flags = 0; - switch (em->block_start) { - case EXTENT_MAP_LAST_BYTE: + if (em->block_start == EXTENT_MAP_LAST_BYTE) { end = 1; flags |= FIEMAP_EXTENT_LAST; - break; - case EXTENT_MAP_HOLE: + } else if (em->block_start == EXTENT_MAP_HOLE) { flags |= FIEMAP_EXTENT_UNWRITTEN; - break; - case EXTENT_MAP_INLINE: + } else if (em->block_start == EXTENT_MAP_INLINE) { flags |= (FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED); - break; - case EXTENT_MAP_DELALLOC: + } else if (em->block_start == EXTENT_MAP_DELALLOC) { flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); - break; - default: + } else { disko = em->block_start; - break; } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 50da69da20ce..30c9365861e6 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -6,19 +6,14 @@ #include <linux/hardirq.h> #include "extent_map.h" -/* temporary define until extent_map moves out of btrfs */ -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *, struct kmem_cache *, - unsigned long)); static struct kmem_cache *extent_map_cache; int __init extent_map_init(void) { - extent_map_cache = btrfs_cache_create("extent_map", - sizeof(struct extent_map), 0, - NULL); + extent_map_cache = kmem_cache_create("extent_map", + sizeof(struct extent_map), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!extent_map_cache) return -ENOMEM; return 0; @@ -43,7 +38,6 @@ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) tree->map.rb_node = NULL; spin_lock_init(&tree->lock); } -EXPORT_SYMBOL(extent_map_tree_init); /** * alloc_extent_map - allocate new extent map structure @@ -64,7 +58,6 @@ struct extent_map *alloc_extent_map(gfp_t mask) atomic_set(&em->refs, 1); return em; } -EXPORT_SYMBOL(alloc_extent_map); /** * free_extent_map - drop reference count of an extent_map @@ -83,7 +76,6 @@ void free_extent_map(struct extent_map *em) kmem_cache_free(extent_map_cache, em); } } -EXPORT_SYMBOL(free_extent_map); static struct rb_node *tree_insert(struct rb_root *root, u64 offset, struct rb_node *node) @@ -234,7 +226,6 @@ int add_extent_mapping(struct extent_map_tree *tree, rb = tree_insert(&tree->map, em->start, &em->rb_node); if (rb) { ret = -EEXIST; - free_extent_map(merge); goto out; } atomic_inc(&em->refs); @@ -265,7 +256,6 @@ int add_extent_mapping(struct extent_map_tree *tree, out: return ret; } -EXPORT_SYMBOL(add_extent_mapping); /* simple helper to do math around the end of an extent, handling wrap */ static u64 range_end(u64 start, u64 len) @@ -327,7 +317,6 @@ found: out: return em; } -EXPORT_SYMBOL(lookup_extent_mapping); /** * remove_extent_mapping - removes an extent_map from the extent tree @@ -347,4 +336,3 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) em->in_tree = 0; return ret; } -EXPORT_SYMBOL(remove_extent_mapping); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9c9fb46ccd08..1d51dc38bb49 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -272,83 +272,6 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, return 0; } -int btrfs_check_file(struct btrfs_root *root, struct inode *inode) -{ - return 0; -#if 0 - struct btrfs_path *path; - struct btrfs_key found_key; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *extent; - u64 last_offset = 0; - int nritems; - int slot; - int found_type; - int ret; - int err = 0; - u64 extent_end = 0; - - path = btrfs_alloc_path(); - ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino, - last_offset, 0); - while (1) { - nritems = btrfs_header_nritems(path->nodes[0]); - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret) - goto out; - nritems = btrfs_header_nritems(path->nodes[0]); - } - slot = path->slots[0]; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, slot); - if (found_key.objectid != inode->i_ino) - break; - if (found_key.type != BTRFS_EXTENT_DATA_KEY) - goto out; - - if (found_key.offset < last_offset) { - WARN_ON(1); - btrfs_print_leaf(root, leaf); - printk(KERN_ERR "inode %lu found offset %llu " - "expected %llu\n", inode->i_ino, - (unsigned long long)found_key.offset, - (unsigned long long)last_offset); - err = 1; - goto out; - } - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(leaf, extent); - if (found_type == BTRFS_FILE_EXTENT_REG) { - extent_end = found_key.offset + - btrfs_file_extent_num_bytes(leaf, extent); - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - struct btrfs_item *item; - item = btrfs_item_nr(leaf, slot); - extent_end = found_key.offset + - btrfs_file_extent_inline_len(leaf, extent); - extent_end = (extent_end + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); - } - last_offset = extent_end; - path->slots[0]++; - } - if (0 && last_offset < inode->i_size) { - WARN_ON(1); - btrfs_print_leaf(root, leaf); - printk(KERN_ERR "inode %lu found offset %llu size %llu\n", - inode->i_ino, (unsigned long long)last_offset, - (unsigned long long)inode->i_size); - err = 1; - - } -out: - btrfs_free_path(path); - return err; -#endif -} - /* * this is very complex, but the basic idea is to drop all extents * in the range start - end. hint_block is filled in with a block number @@ -363,15 +286,16 @@ out: */ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 inline_limit, u64 *hint_byte) + u64 start, u64 end, u64 locked_end, + u64 inline_limit, u64 *hint_byte) { u64 extent_end = 0; - u64 locked_end = end; u64 search_start = start; u64 leaf_start; u64 ram_bytes = 0; u64 orig_parent = 0; u64 disk_bytenr = 0; + u64 orig_locked_end = locked_end; u8 compression; u8 encryption; u16 other_encoding = 0; @@ -684,11 +608,10 @@ next_slot: } out: btrfs_free_path(path); - if (locked_end > end) { - unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, - GFP_NOFS); + if (locked_end > orig_locked_end) { + unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end, + locked_end - 1, GFP_NOFS); } - btrfs_check_file(root, inode); return ret; } @@ -830,7 +753,7 @@ again: ret = btrfs_del_items(trans, root, path, del_slot, del_nr); BUG_ON(ret); - goto done; + goto release; } else if (split == start) { if (locked_end < extent_end) { ret = try_lock_extent(&BTRFS_I(inode)->io_tree, @@ -926,6 +849,8 @@ again: } done: btrfs_mark_buffer_dirty(leaf); + +release: btrfs_release_path(root, path); if (split_end && split == start) { split = end; @@ -1131,7 +1056,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, if (will_write) { btrfs_fdatawrite_range(inode->i_mapping, pos, pos + write_bytes - 1, - WB_SYNC_NONE); + WB_SYNC_ALL); } else { balance_dirty_pages_ratelimited_nr(inode->i_mapping, num_pages); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d1e5f0e84c58..0bc93657b460 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -18,6 +18,15 @@ #include <linux/sched.h> #include "ctree.h" +#include "free-space-cache.h" +#include "transaction.h" + +struct btrfs_free_space { + struct rb_node bytes_index; + struct rb_node offset_index; + u64 offset; + u64 bytes; +}; static int tree_insert_offset(struct rb_root *root, u64 offset, struct rb_node *node) @@ -68,14 +77,24 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes, } /* - * searches the tree for the given offset. If contains is set we will return - * the free space that contains the given offset. If contains is not set we - * will return the free space that starts at or after the given offset and is - * at least bytes long. + * searches the tree for the given offset. + * + * fuzzy == 1: this is used for allocations where we are given a hint of where + * to look for free space. Because the hint may not be completely on an offset + * mark, or the hint may no longer point to free space we need to fudge our + * results a bit. So we look for free space starting at or after offset with at + * least bytes size. We prefer to find as close to the given offset as we can. + * Also if the offset is within a free space range, then we will return the free + * space that contains the given offset, which means we can return a free space + * chunk with an offset before the provided offset. + * + * fuzzy == 0: this is just a normal tree search. Give us the free space that + * starts at the given offset which is at least bytes size, and if its not there + * return NULL. */ static struct btrfs_free_space *tree_search_offset(struct rb_root *root, u64 offset, u64 bytes, - int contains) + int fuzzy) { struct rb_node *n = root->rb_node; struct btrfs_free_space *entry, *ret = NULL; @@ -84,13 +103,14 @@ static struct btrfs_free_space *tree_search_offset(struct rb_root *root, entry = rb_entry(n, struct btrfs_free_space, offset_index); if (offset < entry->offset) { - if (!contains && + if (fuzzy && (!ret || entry->offset < ret->offset) && (bytes <= entry->bytes)) ret = entry; n = n->rb_left; } else if (offset > entry->offset) { - if ((entry->offset + entry->bytes - 1) >= offset && + if (fuzzy && + (entry->offset + entry->bytes - 1) >= offset && bytes <= entry->bytes) { ret = entry; break; @@ -171,6 +191,7 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, int ret = 0; + BUG_ON(!info->bytes); ret = tree_insert_offset(&block_group->free_space_offset, info->offset, &info->offset_index); if (ret) @@ -184,108 +205,70 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, return ret; } -static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) { struct btrfs_free_space *right_info; struct btrfs_free_space *left_info; struct btrfs_free_space *info = NULL; - struct btrfs_free_space *alloc_info; int ret = 0; - alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); - if (!alloc_info) + info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + if (!info) return -ENOMEM; + info->offset = offset; + info->bytes = bytes; + + spin_lock(&block_group->tree_lock); + /* * first we want to see if there is free space adjacent to the range we * are adding, if there is remove that struct and add a new one to * cover the entire range */ right_info = tree_search_offset(&block_group->free_space_offset, - offset+bytes, 0, 1); + offset+bytes, 0, 0); left_info = tree_search_offset(&block_group->free_space_offset, offset-1, 0, 1); - if (right_info && right_info->offset == offset+bytes) { + if (right_info) { unlink_free_space(block_group, right_info); - info = right_info; - info->offset = offset; - info->bytes += bytes; - } else if (right_info && right_info->offset != offset+bytes) { - printk(KERN_ERR "btrfs adding space in the middle of an " - "existing free space area. existing: " - "offset=%llu, bytes=%llu. new: offset=%llu, " - "bytes=%llu\n", (unsigned long long)right_info->offset, - (unsigned long long)right_info->bytes, - (unsigned long long)offset, - (unsigned long long)bytes); - BUG(); + info->bytes += right_info->bytes; + kfree(right_info); } - if (left_info) { + if (left_info && left_info->offset + left_info->bytes == offset) { unlink_free_space(block_group, left_info); - - if (unlikely((left_info->offset + left_info->bytes) != - offset)) { - printk(KERN_ERR "btrfs free space to the left " - "of new free space isn't " - "quite right. existing: offset=%llu, " - "bytes=%llu. new: offset=%llu, bytes=%llu\n", - (unsigned long long)left_info->offset, - (unsigned long long)left_info->bytes, - (unsigned long long)offset, - (unsigned long long)bytes); - BUG(); - } - - if (info) { - info->offset = left_info->offset; - info->bytes += left_info->bytes; - kfree(left_info); - } else { - info = left_info; - info->bytes += bytes; - } + info->offset = left_info->offset; + info->bytes += left_info->bytes; + kfree(left_info); } - if (info) { - ret = link_free_space(block_group, info); - if (!ret) - info = NULL; - goto out; - } - - info = alloc_info; - alloc_info = NULL; - info->offset = offset; - info->bytes = bytes; - ret = link_free_space(block_group, info); if (ret) kfree(info); -out: + + spin_unlock(&block_group->tree_lock); + if (ret) { printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); - if (ret == -EEXIST) - BUG(); + BUG_ON(ret == -EEXIST); } - kfree(alloc_info); - return ret; } -static int -__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) { struct btrfs_free_space *info; int ret = 0; + spin_lock(&block_group->tree_lock); + info = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); - if (info && info->offset == offset) { if (info->bytes < bytes) { printk(KERN_ERR "Found free space at %llu, size %llu," @@ -295,12 +278,14 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, (unsigned long long)bytes); WARN_ON(1); ret = -EINVAL; + spin_unlock(&block_group->tree_lock); goto out; } unlink_free_space(block_group, info); if (info->bytes == bytes) { kfree(info); + spin_unlock(&block_group->tree_lock); goto out; } @@ -308,6 +293,7 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, info->bytes -= bytes; ret = link_free_space(block_group, info); + spin_unlock(&block_group->tree_lock); BUG_ON(ret); } else if (info && info->offset < offset && info->offset + info->bytes >= offset + bytes) { @@ -333,70 +319,37 @@ __btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, */ kfree(info); } - + spin_unlock(&block_group->tree_lock); /* step two, insert a new info struct to cover anything * before the hole */ - ret = __btrfs_add_free_space(block_group, old_start, - offset - old_start); + ret = btrfs_add_free_space(block_group, old_start, + offset - old_start); BUG_ON(ret); } else { + spin_unlock(&block_group->tree_lock); + if (!info) { + printk(KERN_ERR "couldn't find space %llu to free\n", + (unsigned long long)offset); + printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n", + block_group->cached, + (unsigned long long)block_group->key.objectid, + (unsigned long long)block_group->key.offset); + btrfs_dump_free_space(block_group, bytes); + } else if (info) { + printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, " + "but wanted offset=%llu bytes=%llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (unsigned long long)offset, + (unsigned long long)bytes); + } WARN_ON(1); } out: return ret; } -int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret; - struct btrfs_free_space *sp; - - mutex_lock(&block_group->alloc_mutex); - ret = __btrfs_add_free_space(block_group, offset, bytes); - sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); - BUG_ON(!sp); - mutex_unlock(&block_group->alloc_mutex); - - return ret; -} - -int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret; - struct btrfs_free_space *sp; - - ret = __btrfs_add_free_space(block_group, offset, bytes); - sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); - BUG_ON(!sp); - - return ret; -} - -int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret = 0; - - mutex_lock(&block_group->alloc_mutex); - ret = __btrfs_remove_free_space(block_group, offset, bytes); - mutex_unlock(&block_group->alloc_mutex); - - return ret; -} - -int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, - u64 offset, u64 bytes) -{ - int ret; - - ret = __btrfs_remove_free_space(block_group, offset, bytes); - - return ret; -} - void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, u64 bytes) { @@ -408,6 +361,9 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes) count++; + printk(KERN_ERR "entry offset %llu, bytes %llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes); } printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" "\n", count); @@ -428,68 +384,337 @@ u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group) return ret; } +/* + * for a given cluster, put all of its extents back into the free + * space cache. If the block group passed doesn't match the block group + * pointed to by the cluster, someone else raced in and freed the + * cluster already. In that case, we just return without changing anything + */ +static int +__btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster) +{ + struct btrfs_free_space *entry; + struct rb_node *node; + + spin_lock(&cluster->lock); + if (cluster->block_group != block_group) + goto out; + + cluster->window_start = 0; + node = rb_first(&cluster->root); + while(node) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + node = rb_next(&entry->offset_index); + rb_erase(&entry->offset_index, &cluster->root); + link_free_space(block_group, entry); + } + list_del_init(&cluster->block_group_list); + + btrfs_put_block_group(cluster->block_group); + cluster->block_group = NULL; + cluster->root.rb_node = NULL; +out: + spin_unlock(&cluster->lock); + return 0; +} + void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) { struct btrfs_free_space *info; struct rb_node *node; + struct btrfs_free_cluster *cluster; + struct btrfs_free_cluster *safe; + + spin_lock(&block_group->tree_lock); + + list_for_each_entry_safe(cluster, safe, &block_group->cluster_list, + block_group_list) { + + WARN_ON(cluster->block_group != block_group); + __btrfs_return_cluster_to_free_space(block_group, cluster); + } - mutex_lock(&block_group->alloc_mutex); while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { info = rb_entry(node, struct btrfs_free_space, bytes_index); unlink_free_space(block_group, info); kfree(info); if (need_resched()) { - mutex_unlock(&block_group->alloc_mutex); + spin_unlock(&block_group->tree_lock); cond_resched(); - mutex_lock(&block_group->alloc_mutex); + spin_lock(&block_group->tree_lock); } } - mutex_unlock(&block_group->alloc_mutex); + spin_unlock(&block_group->tree_lock); } -#if 0 -static struct btrfs_free_space *btrfs_find_free_space_offset(struct - btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes) +u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes, u64 empty_size) { - struct btrfs_free_space *ret; + struct btrfs_free_space *entry = NULL; + u64 ret = 0; - mutex_lock(&block_group->alloc_mutex); - ret = tree_search_offset(&block_group->free_space_offset, offset, - bytes, 0); - mutex_unlock(&block_group->alloc_mutex); + spin_lock(&block_group->tree_lock); + entry = tree_search_offset(&block_group->free_space_offset, offset, + bytes + empty_size, 1); + if (!entry) + entry = tree_search_bytes(&block_group->free_space_bytes, + offset, bytes + empty_size); + if (entry) { + unlink_free_space(block_group, entry); + ret = entry->offset; + entry->offset += bytes; + entry->bytes -= bytes; + + if (!entry->bytes) + kfree(entry); + else + link_free_space(block_group, entry); + } + spin_unlock(&block_group->tree_lock); return ret; } -static struct btrfs_free_space *btrfs_find_free_space_bytes(struct - btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes) +/* + * given a cluster, put all of its extents back into the free space + * cache. If a block group is passed, this function will only free + * a cluster that belongs to the passed block group. + * + * Otherwise, it'll get a reference on the block group pointed to by the + * cluster and remove the cluster from it. + */ +int btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster) { - struct btrfs_free_space *ret; + int ret; - mutex_lock(&block_group->alloc_mutex); + /* first, get a safe pointer to the block group */ + spin_lock(&cluster->lock); + if (!block_group) { + block_group = cluster->block_group; + if (!block_group) { + spin_unlock(&cluster->lock); + return 0; + } + } else if (cluster->block_group != block_group) { + /* someone else has already freed it don't redo their work */ + spin_unlock(&cluster->lock); + return 0; + } + atomic_inc(&block_group->count); + spin_unlock(&cluster->lock); - ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); - mutex_unlock(&block_group->alloc_mutex); + /* now return any extents the cluster had on it */ + spin_lock(&block_group->tree_lock); + ret = __btrfs_return_cluster_to_free_space(block_group, cluster); + spin_unlock(&block_group->tree_lock); + /* finally drop our ref */ + btrfs_put_block_group(block_group); return ret; } -#endif -struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache - *block_group, u64 offset, - u64 bytes) +/* + * given a cluster, try to allocate 'bytes' from it, returns 0 + * if it couldn't find anything suitably large, or a logical disk offset + * if things worked out + */ +u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start) { - struct btrfs_free_space *ret = NULL; + struct btrfs_free_space *entry = NULL; + struct rb_node *node; + u64 ret = 0; - ret = tree_search_offset(&block_group->free_space_offset, offset, - bytes, 0); - if (!ret) - ret = tree_search_bytes(&block_group->free_space_bytes, - offset, bytes); + spin_lock(&cluster->lock); + if (bytes > cluster->max_size) + goto out; + if (cluster->block_group != block_group) + goto out; + + node = rb_first(&cluster->root); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + + while(1) { + if (entry->bytes < bytes || entry->offset < min_start) { + struct rb_node *node; + + node = rb_next(&entry->offset_index); + if (!node) + break; + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } + ret = entry->offset; + + entry->offset += bytes; + entry->bytes -= bytes; + + if (entry->bytes == 0) { + rb_erase(&entry->offset_index, &cluster->root); + kfree(entry); + } + break; + } +out: + spin_unlock(&cluster->lock); return ret; } + +/* + * here we try to find a cluster of blocks in a block group. The goal + * is to find at least bytes free and up to empty_size + bytes free. + * We might not find them all in one contiguous area. + * + * returns zero and sets up cluster if things worked out, otherwise + * it returns -enospc + */ +int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size) +{ + struct btrfs_free_space *entry = NULL; + struct rb_node *node; + struct btrfs_free_space *next; + struct btrfs_free_space *last; + u64 min_bytes; + u64 window_start; + u64 window_free; + u64 max_extent = 0; + int total_retries = 0; + int ret; + + /* for metadata, allow allocates with more holes */ + if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { + /* + * we want to do larger allocations when we are + * flushing out the delayed refs, it helps prevent + * making more work as we go along. + */ + if (trans->transaction->delayed_refs.flushing) + min_bytes = max(bytes, (bytes + empty_size) >> 1); + else + min_bytes = max(bytes, (bytes + empty_size) >> 4); + } else + min_bytes = max(bytes, (bytes + empty_size) >> 2); + + spin_lock(&block_group->tree_lock); + spin_lock(&cluster->lock); + + /* someone already found a cluster, hooray */ + if (cluster->block_group) { + ret = 0; + goto out; + } +again: + min_bytes = min(min_bytes, bytes + empty_size); + entry = tree_search_bytes(&block_group->free_space_bytes, + offset, min_bytes); + if (!entry) { + ret = -ENOSPC; + goto out; + } + window_start = entry->offset; + window_free = entry->bytes; + last = entry; + max_extent = entry->bytes; + + while(1) { + /* out window is just right, lets fill it */ + if (window_free >= bytes + empty_size) + break; + + node = rb_next(&last->offset_index); + if (!node) { + ret = -ENOSPC; + goto out; + } + next = rb_entry(node, struct btrfs_free_space, offset_index); + + /* + * we haven't filled the empty size and the window is + * very large. reset and try again + */ + if (next->offset - window_start > (bytes + empty_size) * 2) { + entry = next; + window_start = entry->offset; + window_free = entry->bytes; + last = entry; + max_extent = 0; + total_retries++; + if (total_retries % 256 == 0) { + if (min_bytes >= (bytes + empty_size)) { + ret = -ENOSPC; + goto out; + } + /* + * grow our allocation a bit, we're not having + * much luck + */ + min_bytes *= 2; + goto again; + } + } else { + last = next; + window_free += next->bytes; + if (entry->bytes > max_extent) + max_extent = entry->bytes; + } + } + + cluster->window_start = entry->offset; + + /* + * now we've found our entries, pull them out of the free space + * cache and put them into the cluster rbtree + * + * The cluster includes an rbtree, but only uses the offset index + * of each free space cache entry. + */ + while(1) { + node = rb_next(&entry->offset_index); + unlink_free_space(block_group, entry); + ret = tree_insert_offset(&cluster->root, entry->offset, + &entry->offset_index); + BUG_ON(ret); + + if (!node || entry == last) + break; + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + } + ret = 0; + cluster->max_size = max_extent; + atomic_inc(&block_group->count); + list_add_tail(&cluster->block_group_list, &block_group->cluster_list); + cluster->block_group = block_group; +out: + spin_unlock(&cluster->lock); + spin_unlock(&block_group->tree_lock); + + return ret; +} + +/* + * simple code to zero out a cluster + */ +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) +{ + spin_lock_init(&cluster->lock); + spin_lock_init(&cluster->refill_lock); + cluster->root.rb_node = NULL; + cluster->max_size = 0; + INIT_LIST_HEAD(&cluster->block_group_list); + cluster->block_group = NULL; +} + diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h new file mode 100644 index 000000000000..ab0bdc0a63ce --- /dev/null +++ b/fs/btrfs/free-space-cache.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2009 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_FREE_SPACE_CACHE +#define __BTRFS_FREE_SPACE_CACHE + +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache + *block_group); +u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes, u64 empty_size); +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes); +u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); +int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 empty_size); +void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster); +u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, u64 bytes, + u64 min_start); +int btrfs_return_cluster_to_free_space( + struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster); +#endif diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index cc7334d833c9..9abbced1123d 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -79,7 +79,7 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, } path = btrfs_alloc_path(); BUG_ON(!path); - search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID); + search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID); search_key.objectid = search_start; search_key.type = 0; search_key.offset = 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06d8db5afb08..90c23eb28829 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -70,7 +70,6 @@ static struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; -struct kmem_cache *btrfs_bit_radix_cachep; struct kmem_cache *btrfs_path_cachep; #define S_SHIFT 12 @@ -234,7 +233,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, } ret = btrfs_drop_extents(trans, root, inode, start, - aligned_end, start, &hint_byte); + aligned_end, aligned_end, start, &hint_byte); BUG_ON(ret); if (isize > actual_end) @@ -1439,6 +1438,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct inode *inode, u64 file_pos, u64 disk_bytenr, u64 disk_num_bytes, u64 num_bytes, u64 ram_bytes, + u64 locked_end, u8 compression, u8 encryption, u16 other_encoding, int extent_type) { @@ -1455,7 +1455,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; ret = btrfs_drop_extents(trans, root, inode, file_pos, - file_pos + num_bytes, file_pos, &hint); + file_pos + num_bytes, locked_end, + file_pos, &hint); BUG_ON(ret); ins.objectid = inode->i_ino; @@ -1590,6 +1591,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->disk_len, ordered_extent->len, ordered_extent->len, + ordered_extent->file_offset + + ordered_extent->len, compressed, 0, 0, BTRFS_FILE_EXTENT_REG); BUG_ON(ret); @@ -1819,10 +1822,12 @@ good: return 0; zeroit: - printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " - "private %llu\n", page->mapping->host->i_ino, - (unsigned long long)start, csum, - (unsigned long long)private); + if (printk_ratelimit()) { + printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " + "private %llu\n", page->mapping->host->i_ino, + (unsigned long long)start, csum, + (unsigned long long)private); + } memset(kaddr + offset, 1, end - start + 1); flush_dcache_page(page); kunmap_atomic(kaddr, KM_USER0); @@ -2011,6 +2016,57 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) } /* + * very simple check to peek ahead in the leaf looking for xattrs. If we + * don't find any xattrs, we know there can't be any acls. + * + * slot is the slot the inode is in, objectid is the objectid of the inode + */ +static noinline int acls_after_inode_item(struct extent_buffer *leaf, + int slot, u64 objectid) +{ + u32 nritems = btrfs_header_nritems(leaf); + struct btrfs_key found_key; + int scanned = 0; + + slot++; + while (slot < nritems) { + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* we found a different objectid, there must not be acls */ + if (found_key.objectid != objectid) + return 0; + + /* we found an xattr, assume we've got an acl */ + if (found_key.type == BTRFS_XATTR_ITEM_KEY) + return 1; + + /* + * we found a key greater than an xattr key, there can't + * be any acls later on + */ + if (found_key.type > BTRFS_XATTR_ITEM_KEY) + return 0; + + slot++; + scanned++; + + /* + * it goes inode, inode backrefs, xattrs, extents, + * so if there are a ton of hard links to an inode there can + * be a lot of backrefs. Don't waste time searching too hard, + * this is just an optimization + */ + if (scanned >= 8) + break; + } + /* we hit the end of the leaf before we found an xattr or + * something larger than an xattr. We have to assume the inode + * has acls + */ + return 1; +} + +/* * read an inode from the btree into the in-memory inode */ void btrfs_read_locked_inode(struct inode *inode) @@ -2021,6 +2077,7 @@ void btrfs_read_locked_inode(struct inode *inode) struct btrfs_timespec *tspec; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; + int maybe_acls; u64 alloc_group_block; u32 rdev; int ret; @@ -2067,6 +2124,16 @@ void btrfs_read_locked_inode(struct inode *inode) alloc_group_block = btrfs_inode_block_group(leaf, inode_item); + /* + * try to precache a NULL acl entry for files that don't have + * any xattrs or acls + */ + maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); + if (!maybe_acls) { + BTRFS_I(inode)->i_acl = NULL; + BTRFS_I(inode)->i_default_acl = NULL; + } + BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, alloc_group_block, 0); btrfs_free_path(path); @@ -2877,6 +2944,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) err = btrfs_drop_extents(trans, root, inode, cur_offset, cur_offset + hole_size, + block_end, cur_offset, &hint_byte); if (err) break; @@ -3041,8 +3109,8 @@ static noinline void init_btrfs_i(struct inode *inode) { struct btrfs_inode *bi = BTRFS_I(inode); - bi->i_acl = NULL; - bi->i_default_acl = NULL; + bi->i_acl = BTRFS_ACL_NOT_CACHED; + bi->i_default_acl = BTRFS_ACL_NOT_CACHED; bi->generation = 0; bi->sequence = 0; @@ -3481,8 +3549,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (dir) { ret = btrfs_set_inode_index(dir, index); - if (ret) + if (ret) { + iput(inode); return ERR_PTR(ret); + } } /* * index_cnt is ignored for everything but a dir, @@ -3565,6 +3635,7 @@ fail: if (dir) BTRFS_I(dir)->index_cnt--; btrfs_free_path(path); + iput(inode); return ERR_PTR(ret); } @@ -4631,47 +4702,36 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_trans_handle_cachep); if (btrfs_transaction_cachep) kmem_cache_destroy(btrfs_transaction_cachep); - if (btrfs_bit_radix_cachep) - kmem_cache_destroy(btrfs_bit_radix_cachep); if (btrfs_path_cachep) kmem_cache_destroy(btrfs_path_cachep); } -struct kmem_cache *btrfs_cache_create(const char *name, size_t size, - unsigned long extra_flags, - void (*ctor)(void *)) -{ - return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD | extra_flags), ctor); -} - int btrfs_init_cachep(void) { - btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache", - sizeof(struct btrfs_inode), - 0, init_once); + btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache", + sizeof(struct btrfs_inode), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); if (!btrfs_inode_cachep) goto fail; - btrfs_trans_handle_cachep = - btrfs_cache_create("btrfs_trans_handle_cache", - sizeof(struct btrfs_trans_handle), - 0, NULL); + + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache", + sizeof(struct btrfs_trans_handle), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_trans_handle_cachep) goto fail; - btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache", - sizeof(struct btrfs_transaction), - 0, NULL); + + btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache", + sizeof(struct btrfs_transaction), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_transaction_cachep) goto fail; - btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache", - sizeof(struct btrfs_path), - 0, NULL); + + btrfs_path_cachep = kmem_cache_create("btrfs_path_cache", + sizeof(struct btrfs_path), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); if (!btrfs_path_cachep) goto fail; - btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256, - SLAB_DESTROY_BY_RCU, NULL); - if (!btrfs_bit_radix_cachep) - goto fail; + return 0; fail: btrfs_destroy_cachep(); @@ -4967,10 +5027,10 @@ out_fail: return err; } -static int prealloc_file_range(struct inode *inode, u64 start, u64 end, - u64 alloc_hint, int mode) +static int prealloc_file_range(struct btrfs_trans_handle *trans, + struct inode *inode, u64 start, u64 end, + u64 locked_end, u64 alloc_hint, int mode) { - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 alloc_size; @@ -4978,10 +5038,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, u64 num_bytes = end - start; int ret = 0; - trans = btrfs_join_transaction(root, 1); - BUG_ON(!trans); - btrfs_set_trans_block_group(trans, inode); - while (num_bytes > 0) { alloc_size = min(num_bytes, root->fs_info->max_extent); ret = btrfs_reserve_extent(trans, root, alloc_size, @@ -4994,7 +5050,8 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, ret = insert_reserved_file_extent(trans, inode, cur_offset, ins.objectid, ins.offset, ins.offset, - ins.offset, 0, 0, 0, + ins.offset, locked_end, + 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); BUG_ON(ret); num_bytes -= ins.offset; @@ -5012,7 +5069,6 @@ out: BUG_ON(ret); } - btrfs_end_transaction(trans, root); return ret; } @@ -5024,13 +5080,21 @@ static long btrfs_fallocate(struct inode *inode, int mode, u64 alloc_start; u64 alloc_end; u64 alloc_hint = 0; + u64 locked_end; u64 mask = BTRFS_I(inode)->root->sectorsize - 1; struct extent_map *em; + struct btrfs_trans_handle *trans; int ret; alloc_start = offset & ~mask; alloc_end = (offset + len + mask) & ~mask; + /* + * wait for ordered IO before we have any locks. We'll loop again + * below with the locks held. + */ + btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); + mutex_lock(&inode->i_mutex); if (alloc_start > inode->i_size) { ret = btrfs_cont_expand(inode, alloc_start); @@ -5038,10 +5102,21 @@ static long btrfs_fallocate(struct inode *inode, int mode, goto out; } + locked_end = alloc_end - 1; while (1) { struct btrfs_ordered_extent *ordered; - lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, - alloc_end - 1, GFP_NOFS); + + trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); + if (!trans) { + ret = -EIO; + goto out; + } + + /* the extent lock is ordered inside the running + * transaction + */ + lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + GFP_NOFS); ordered = btrfs_lookup_first_ordered_extent(inode, alloc_end - 1); if (ordered && @@ -5049,7 +5124,13 @@ static long btrfs_fallocate(struct inode *inode, int mode, ordered->file_offset < alloc_end) { btrfs_put_ordered_extent(ordered); unlock_extent(&BTRFS_I(inode)->io_tree, - alloc_start, alloc_end - 1, GFP_NOFS); + alloc_start, locked_end, GFP_NOFS); + btrfs_end_transaction(trans, BTRFS_I(inode)->root); + + /* + * we can't wait on the range with the transaction + * running or with the extent lock held + */ btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); } else { @@ -5067,8 +5148,9 @@ static long btrfs_fallocate(struct inode *inode, int mode, last_byte = min(extent_map_end(em), alloc_end); last_byte = (last_byte + mask) & ~mask; if (em->block_start == EXTENT_MAP_HOLE) { - ret = prealloc_file_range(inode, cur_offset, - last_byte, alloc_hint, mode); + ret = prealloc_file_range(trans, inode, cur_offset, + last_byte, locked_end + 1, + alloc_hint, mode); if (ret < 0) { free_extent_map(em); break; @@ -5084,8 +5166,10 @@ static long btrfs_fallocate(struct inode *inode, int mode, break; } } - unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, + unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, GFP_NOFS); + + btrfs_end_transaction(trans, BTRFS_I(inode)->root); out: mutex_unlock(&inode->i_mutex); return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bca729fc80c8..5e94ea6e1cbe 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -267,7 +267,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, goto out_dput; if (!IS_POSIXACL(parent->dentry->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = mnt_want_write(parent->mnt); if (error) @@ -461,15 +461,9 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; namelen = strlen(vol_args->name); @@ -483,11 +477,13 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) *devstr = '\0'; devstr = vol_args->name; devid = simple_strtoull(devstr, &end, 10); - printk(KERN_INFO "resizing devid %llu\n", devid); + printk(KERN_INFO "resizing devid %llu\n", + (unsigned long long)devid); } device = btrfs_find_device(root, devid, NULL, NULL); if (!device) { - printk(KERN_INFO "resizer unable to find device %llu\n", devid); + printk(KERN_INFO "resizer unable to find device %llu\n", + (unsigned long long)devid); ret = -EINVAL; goto out_unlock; } @@ -545,7 +541,6 @@ static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) out_unlock: mutex_unlock(&root->fs_info->volume_mutex); -out: kfree(vol_args); return ret; } @@ -565,15 +560,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); - - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; namelen = strlen(vol_args->name); @@ -675,19 +664,13 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_init_new_device(root, vol_args->name); -out: kfree(vol_args); return ret; } @@ -703,19 +686,13 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + vol_args = memdup_user(arg, sizeof(*vol_args)); + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); - if (!vol_args) - return -ENOMEM; - - if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { - ret = -EFAULT; - goto out; - } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_rm_device(root, vol_args->name); -out: kfree(vol_args); return ret; } @@ -830,7 +807,8 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, BUG_ON(!trans); /* punch hole in destination first */ - btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte); + btrfs_drop_extents(trans, root, inode, off, off + len, + off + len, 0, &hint_byte); /* clone data */ key.objectid = src->i_ino; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index a5310c0f41e2..1c36e5cd8f55 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -60,8 +60,8 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb) /* * unfortunately, many of the places that currently set a lock to blocking - * don't end up blocking for every long, and often they don't block - * at all. For a dbench 50 run, if we don't spin one the blocking bit + * don't end up blocking for very long, and often they don't block + * at all. For a dbench 50 run, if we don't spin on the blocking bit * at all, the context switch rate can jump up to 400,000/sec or more. * * So, we're still stuck with this crummy spin on the blocking bit, diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 53c87b197d70..d6f0806c682f 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -489,7 +489,7 @@ again: /* start IO across the range first to instantiate any delalloc * extents */ - btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); + btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); /* The compression code will leave pages locked but return from * writepage without setting the page writeback. Starting again diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 19a4daf03ccb..3536bdb2d7cb 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -24,6 +24,7 @@ #include <linux/highmem.h> #include <linux/time.h> #include <linux/init.h> +#include <linux/seq_file.h> #include <linux/string.h> #include <linux/smp_lock.h> #include <linux/backing-dev.h> @@ -66,7 +67,8 @@ static void btrfs_put_super(struct super_block *sb) enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, - Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, + Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_notreelog, + Opt_ratio, Opt_flushoncommit, Opt_err, }; static match_table_t tokens = { @@ -83,6 +85,9 @@ static match_table_t tokens = { {Opt_compress, "compress"}, {Opt_ssd, "ssd"}, {Opt_noacl, "noacl"}, + {Opt_notreelog, "notreelog"}, + {Opt_flushoncommit, "flushoncommit"}, + {Opt_ratio, "metadata_ratio=%d"}, {Opt_err, NULL}, }; @@ -191,7 +196,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) info->max_extent = max_t(u64, info->max_extent, root->sectorsize); printk(KERN_INFO "btrfs: max_extent at %llu\n", - info->max_extent); + (unsigned long long)info->max_extent); } break; case Opt_max_inline: @@ -206,7 +211,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) root->sectorsize); } printk(KERN_INFO "btrfs: max_inline at %llu\n", - info->max_inline); + (unsigned long long)info->max_inline); } break; case Opt_alloc_start: @@ -216,12 +221,29 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) kfree(num); printk(KERN_INFO "btrfs: allocations start at %llu\n", - info->alloc_start); + (unsigned long long)info->alloc_start); } break; case Opt_noacl: root->fs_info->sb->s_flags &= ~MS_POSIXACL; break; + case Opt_notreelog: + printk(KERN_INFO "btrfs: disabling tree log\n"); + btrfs_set_opt(info->mount_opt, NOTREELOG); + break; + case Opt_flushoncommit: + printk(KERN_INFO "btrfs: turning on flush-on-commit\n"); + btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT); + break; + case Opt_ratio: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->metadata_ratio = intarg; + printk(KERN_INFO "btrfs: metadata ratio %d\n", + info->metadata_ratio); + } + break; default: break; } @@ -363,9 +385,8 @@ fail_close: int btrfs_sync_fs(struct super_block *sb, int wait) { struct btrfs_trans_handle *trans; - struct btrfs_root *root; + struct btrfs_root *root = btrfs_sb(sb); int ret; - root = btrfs_sb(sb); if (sb->s_flags & MS_RDONLY) return 0; @@ -385,6 +406,44 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return ret; } +static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); + struct btrfs_fs_info *info = root->fs_info; + + if (btrfs_test_opt(root, DEGRADED)) + seq_puts(seq, ",degraded"); + if (btrfs_test_opt(root, NODATASUM)) + seq_puts(seq, ",nodatasum"); + if (btrfs_test_opt(root, NODATACOW)) + seq_puts(seq, ",nodatacow"); + if (btrfs_test_opt(root, NOBARRIER)) + seq_puts(seq, ",nobarrier"); + if (info->max_extent != (u64)-1) + seq_printf(seq, ",max_extent=%llu", + (unsigned long long)info->max_extent); + if (info->max_inline != 8192 * 1024) + seq_printf(seq, ",max_inline=%llu", + (unsigned long long)info->max_inline); + if (info->alloc_start != 0) + seq_printf(seq, ",alloc_start=%llu", + (unsigned long long)info->alloc_start); + if (info->thread_pool_size != min_t(unsigned long, + num_online_cpus() + 2, 8)) + seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); + if (btrfs_test_opt(root, COMPRESS)) + seq_puts(seq, ",compress"); + if (btrfs_test_opt(root, SSD)) + seq_puts(seq, ",ssd"); + if (btrfs_test_opt(root, NOTREELOG)) + seq_puts(seq, ",no-treelog"); + if (btrfs_test_opt(root, FLUSHONCOMMIT)) + seq_puts(seq, ",flush-on-commit"); + if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) + seq_puts(seq, ",noacl"); + return 0; +} + static void btrfs_write_super(struct super_block *sb) { sb->s_dirt = 0; @@ -589,14 +648,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - vol = kmalloc(sizeof(*vol), GFP_KERNEL); - if (!vol) - return -ENOMEM; - - if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { - ret = -EFAULT; - goto out; - } + vol = memdup_user((void __user *)arg, sizeof(*vol)); + if (IS_ERR(vol)) + return PTR_ERR(vol); switch (cmd) { case BTRFS_IOC_SCAN_DEV: @@ -604,7 +658,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, &btrfs_fs_type, &fs_devices); break; } -out: + kfree(vol); return ret; } @@ -630,7 +684,7 @@ static struct super_operations btrfs_super_ops = { .put_super = btrfs_put_super, .write_super = btrfs_write_super, .sync_fs = btrfs_sync_fs, - .show_options = generic_show_options, + .show_options = btrfs_show_options, .write_inode = btrfs_write_inode, .dirty_inode = btrfs_dirty_inode, .alloc_inode = btrfs_alloc_inode, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 664782c6a2df..01b143605ec1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -53,8 +53,6 @@ static noinline int join_transaction(struct btrfs_root *root) GFP_NOFS); BUG_ON(!cur_trans); root->fs_info->generation++; - root->fs_info->last_alloc = 0; - root->fs_info->last_data_alloc = 0; cur_trans->num_writers = 1; cur_trans->num_joined = 0; cur_trans->transid = root->fs_info->generation; @@ -689,7 +687,13 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info) prepare_to_wait(&info->transaction_wait, &wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&info->trans_mutex); + + atomic_dec(&info->throttles); + wake_up(&info->transaction_throttle); + schedule(); + + atomic_inc(&info->throttles); mutex_lock(&info->trans_mutex); finish_wait(&info->transaction_wait, &wait); } @@ -974,6 +978,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, int ret; int should_grow = 0; unsigned long now = get_seconds(); + int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT); btrfs_run_ordered_operations(root, 0); @@ -1053,7 +1058,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_unlock(&root->fs_info->trans_mutex); - if (snap_pending) { + if (flush_on_commit || snap_pending) { + if (flush_on_commit) + btrfs_start_delalloc_inodes(root); ret = btrfs_wait_ordered_extents(root, 1); BUG_ON(ret); } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index fc9b87a7975b..db5e212e8445 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -262,11 +262,9 @@ static int process_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct walk_control *wc, u64 gen) { - if (wc->pin) { - mutex_lock(&log->fs_info->pinned_mutex); + if (wc->pin) btrfs_update_pinned_extents(log->fs_info->extent_root, eb->start, eb->len, 1); - } if (btrfs_buffer_uptodate(eb, gen)) { if (wc->write) @@ -538,7 +536,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ ret = btrfs_drop_extents(trans, root, inode, - start, extent_end, start, &alloc_hint); + start, extent_end, extent_end, start, &alloc_hint); BUG_ON(ret); if (found_type == BTRFS_FILE_EXTENT_REG || @@ -1224,8 +1222,7 @@ insert: ret = insert_one_name(trans, root, path, key->objectid, key->offset, name, name_len, log_type, &log_key); - if (ret && ret != -ENOENT) - BUG(); + BUG_ON(ret && ret != -ENOENT); goto out; } @@ -2900,6 +2897,11 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, sb = inode->i_sb; + if (btrfs_test_opt(root, NOTREELOG)) { + ret = 1; + goto end_no_trans; + } + if (root->fs_info->last_trans_log_full_commit > root->fs_info->last_trans_committed) { ret = 1; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index dd06e18e5aac..5f01dad4b696 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -20,6 +20,7 @@ #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/random.h> +#include <linux/iocontext.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -124,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) return NULL; } +static void requeue_list(struct btrfs_pending_bios *pending_bios, + struct bio *head, struct bio *tail) +{ + + struct bio *old_head; + + old_head = pending_bios->head; + pending_bios->head = head; + if (pending_bios->tail) + tail->bi_next = old_head; + else + pending_bios->tail = tail; +} + /* * we try to collect pending bios for a device so we don't get a large * number of procs sending bios down to the same device. This greatly @@ -140,31 +155,44 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) struct bio *pending; struct backing_dev_info *bdi; struct btrfs_fs_info *fs_info; + struct btrfs_pending_bios *pending_bios; struct bio *tail; struct bio *cur; int again = 0; - unsigned long num_run = 0; + unsigned long num_run; + unsigned long num_sync_run; unsigned long limit; + unsigned long last_waited = 0; - bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; + bdi = blk_get_backing_dev_info(device->bdev); fs_info = device->dev_root->fs_info; limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; + /* we want to make sure that every time we switch from the sync + * list to the normal list, we unplug + */ + num_sync_run = 0; + loop: spin_lock(&device->io_lock); + num_run = 0; loop_lock: + /* take all the bios off the list at once and process them * later on (without the lock held). But, remember the * tail and other pointers so the bios can be properly reinserted * into the list if we hit congestion */ - pending = device->pending_bios; - tail = device->pending_bio_tail; + if (device->pending_sync_bios.head) + pending_bios = &device->pending_sync_bios; + else + pending_bios = &device->pending_bios; + + pending = pending_bios->head; + tail = pending_bios->tail; WARN_ON(pending && !tail); - device->pending_bios = NULL; - device->pending_bio_tail = NULL; /* * if pending was null this time around, no bios need processing @@ -174,16 +202,41 @@ loop_lock: * device->running_pending is used to synchronize with the * schedule_bio code. */ - if (pending) { - again = 1; - device->running_pending = 1; - } else { + if (device->pending_sync_bios.head == NULL && + device->pending_bios.head == NULL) { again = 0; device->running_pending = 0; + } else { + again = 1; + device->running_pending = 1; } + + pending_bios->head = NULL; + pending_bios->tail = NULL; + spin_unlock(&device->io_lock); + /* + * if we're doing the regular priority list, make sure we unplug + * for any high prio bios we've sent down + */ + if (pending_bios == &device->pending_bios && num_sync_run > 0) { + num_sync_run = 0; + blk_run_backing_dev(bdi, NULL); + } + while (pending) { + + rmb(); + if (pending_bios != &device->pending_sync_bios && + device->pending_sync_bios.head && + num_run > 16) { + cond_resched(); + spin_lock(&device->io_lock); + requeue_list(pending_bios, pending, tail); + goto loop_lock; + } + cur = pending; pending = pending->bi_next; cur->bi_next = NULL; @@ -194,10 +247,18 @@ loop_lock: wake_up(&fs_info->async_submit_wait); BUG_ON(atomic_read(&cur->bi_cnt) == 0); - bio_get(cur); submit_bio(cur->bi_rw, cur); - bio_put(cur); num_run++; + if (bio_sync(cur)) + num_sync_run++; + + if (need_resched()) { + if (num_sync_run) { + blk_run_backing_dev(bdi, NULL); + num_sync_run = 0; + } + cond_resched(); + } /* * we made progress, there is more work to do and the bdi @@ -206,17 +267,41 @@ loop_lock: */ if (pending && bdi_write_congested(bdi) && num_run > 16 && fs_info->fs_devices->open_devices > 1) { - struct bio *old_head; + struct io_context *ioc; - spin_lock(&device->io_lock); - - old_head = device->pending_bios; - device->pending_bios = pending; - if (device->pending_bio_tail) - tail->bi_next = old_head; - else - device->pending_bio_tail = tail; + ioc = current->io_context; + /* + * the main goal here is that we don't want to + * block if we're going to be able to submit + * more requests without blocking. + * + * This code does two great things, it pokes into + * the elevator code from a filesystem _and_ + * it makes assumptions about how batching works. + */ + if (ioc && ioc->nr_batch_requests > 0 && + time_before(jiffies, ioc->last_waited + HZ/50UL) && + (last_waited == 0 || + ioc->last_waited == last_waited)) { + /* + * we want to go through our batch of + * requests and stop. So, we copy out + * the ioc->last_waited time and test + * against it before looping + */ + last_waited = ioc->last_waited; + if (need_resched()) { + if (num_sync_run) { + blk_run_backing_dev(bdi, NULL); + num_sync_run = 0; + } + cond_resched(); + } + continue; + } + spin_lock(&device->io_lock); + requeue_list(pending_bios, pending, tail); device->running_pending = 1; spin_unlock(&device->io_lock); @@ -224,13 +309,32 @@ loop_lock: goto done; } } + + if (num_sync_run) { + num_sync_run = 0; + blk_run_backing_dev(bdi, NULL); + } + + cond_resched(); if (again) goto loop; spin_lock(&device->io_lock); - if (device->pending_bios) + if (device->pending_bios.head || device->pending_sync_bios.head) goto loop_lock; spin_unlock(&device->io_lock); + + /* + * IO has already been through a long path to get here. Checksumming, + * async helper threads, perhaps compression. We've done a pretty + * good job of collecting a batch of IO and should just unplug + * the device right away. + * + * This will help anyone who is waiting on the IO, they might have + * already unplugged, but managed to do so before the bio they + * cared about found its way down here. + */ + blk_run_backing_dev(bdi, NULL); done: return 0; } @@ -1439,7 +1543,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); - btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); btrfs_mark_buffer_dirty(leaf); @@ -1836,14 +1940,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) device->total_bytes = new_size; if (device->writeable) device->fs_devices->total_rw_bytes -= diff; - ret = btrfs_update_device(trans, device); - if (ret) { - unlock_chunks(root); - btrfs_end_transaction(trans, root); - goto done; - } - WARN_ON(diff > old_total); - btrfs_set_super_total_bytes(super_copy, old_total - diff); unlock_chunks(root); btrfs_end_transaction(trans, root); @@ -1875,7 +1971,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) length = btrfs_dev_extent_length(l, dev_extent); if (key.offset + length <= new_size) - goto done; + break; chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); @@ -1888,6 +1984,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) goto done; } + /* Shrinking succeeded, else we would be at "done". */ + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto done; + } + lock_chunks(root); + + device->disk_total_bytes = new_size; + /* Now btrfs_update_device() will change the on-disk size. */ + ret = btrfs_update_device(trans, device); + if (ret) { + unlock_chunks(root); + btrfs_end_transaction(trans, root); + goto done; + } + WARN_ON(diff > old_total); + btrfs_set_super_total_bytes(super_copy, old_total - diff); + unlock_chunks(root); + btrfs_end_transaction(trans, root); done: btrfs_free_path(path); return ret; @@ -2458,7 +2574,7 @@ again: max_errors = 1; } } - if (multi_ret && rw == WRITE && + if (multi_ret && (rw & (1 << BIO_RW)) && stripes_allocated < stripes_required) { stripes_allocated = map->num_stripes; free_extent_map(em); @@ -2723,6 +2839,7 @@ static noinline int schedule_bio(struct btrfs_root *root, int rw, struct bio *bio) { int should_queue = 1; + struct btrfs_pending_bios *pending_bios; /* don't bother with additional async steps for reads, right now */ if (!(rw & (1 << BIO_RW))) { @@ -2744,13 +2861,17 @@ static noinline int schedule_bio(struct btrfs_root *root, bio->bi_rw |= rw; spin_lock(&device->io_lock); + if (bio_sync(bio)) + pending_bios = &device->pending_sync_bios; + else + pending_bios = &device->pending_bios; - if (device->pending_bio_tail) - device->pending_bio_tail->bi_next = bio; + if (pending_bios->tail) + pending_bios->tail->bi_next = bio; - device->pending_bio_tail = bio; - if (!device->pending_bios) - device->pending_bios = bio; + pending_bios->tail = bio; + if (!pending_bios->head) + pending_bios->head = bio; if (device->running_pending) should_queue = 0; @@ -2967,7 +3088,8 @@ static int fill_device_from_item(struct extent_buffer *leaf, unsigned long ptr; device->devid = btrfs_device_id(leaf, dev_item); - device->total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->total_bytes = device->disk_total_bytes; device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); device->type = btrfs_device_type(leaf, dev_item); device->io_align = btrfs_device_io_align(leaf, dev_item); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 86c44e9ae110..5c3ff6d02fd7 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -23,13 +23,22 @@ #include "async-thread.h" struct buffer_head; +struct btrfs_pending_bios { + struct bio *head; + struct bio *tail; +}; + struct btrfs_device { struct list_head dev_list; struct list_head dev_alloc_list; struct btrfs_fs_devices *fs_devices; struct btrfs_root *dev_root; - struct bio *pending_bios; - struct bio *pending_bio_tail; + + /* regular prio bios */ + struct btrfs_pending_bios pending_bios; + /* WRITE_SYNC bios */ + struct btrfs_pending_bios pending_sync_bios; + int running_pending; u64 generation; @@ -52,6 +61,9 @@ struct btrfs_device { /* size of the device */ u64 total_bytes; + /* size of the disk */ + u64 disk_total_bytes; + /* bytes used */ u64 bytes_used; @@ -76,7 +88,7 @@ struct btrfs_device { struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ - /* the device with this id has the most recent coyp of the super */ + /* the device with this id has the most recent copy of the super */ u64 latest_devid; u64 latest_trans; u64 num_devices; |