diff options
Diffstat (limited to 'drivers/md/persistent-data')
-rw-r--r-- | drivers/md/persistent-data/dm-array.c | 52 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-btree-internal.h | 13 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-btree-remove.c | 7 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-btree-spine.c | 16 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-btree.c | 542 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-btree.h | 10 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-space-map-common.c | 534 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-space-map-common.h | 34 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-space-map-disk.c | 83 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-space-map-metadata.c | 105 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-space-map.h | 18 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-transaction-manager.c | 61 | ||||
-rw-r--r-- | drivers/md/persistent-data/dm-transaction-manager.h | 22 |
13 files changed, 1260 insertions, 237 deletions
diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 185dc60360b5..3a963d783a86 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -108,12 +108,10 @@ static void *element_at(struct dm_array_info *info, struct array_block *ab, * in an array block. */ static void on_entries(struct dm_array_info *info, struct array_block *ab, - void (*fn)(void *, const void *)) + void (*fn)(void *, const void *, unsigned)) { - unsigned i, nr_entries = le32_to_cpu(ab->nr_entries); - - for (i = 0; i < nr_entries; i++) - fn(info->value_type.context, element_at(info, ab, i)); + unsigned nr_entries = le32_to_cpu(ab->nr_entries); + fn(info->value_type.context, element_at(info, ab, 0), nr_entries); } /* @@ -175,19 +173,18 @@ static int alloc_ablock(struct dm_array_info *info, size_t size_of_block, static void fill_ablock(struct dm_array_info *info, struct array_block *ab, const void *value, unsigned new_nr) { - unsigned i; - uint32_t nr_entries; + uint32_t nr_entries, delta, i; struct dm_btree_value_type *vt = &info->value_type; BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); BUG_ON(new_nr < le32_to_cpu(ab->nr_entries)); nr_entries = le32_to_cpu(ab->nr_entries); - for (i = nr_entries; i < new_nr; i++) { - if (vt->inc) - vt->inc(vt->context, value); + delta = new_nr - nr_entries; + if (vt->inc) + vt->inc(vt->context, value, delta); + for (i = nr_entries; i < new_nr; i++) memcpy(element_at(info, ab, i), value, vt->size); - } ab->nr_entries = cpu_to_le32(new_nr); } @@ -199,17 +196,16 @@ static void fill_ablock(struct dm_array_info *info, struct array_block *ab, static void trim_ablock(struct dm_array_info *info, struct array_block *ab, unsigned new_nr) { - unsigned i; - uint32_t nr_entries; + uint32_t nr_entries, delta; struct dm_btree_value_type *vt = &info->value_type; BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); BUG_ON(new_nr > le32_to_cpu(ab->nr_entries)); nr_entries = le32_to_cpu(ab->nr_entries); - for (i = nr_entries; i > new_nr; i--) - if (vt->dec) - vt->dec(vt->context, element_at(info, ab, i - 1)); + delta = nr_entries - new_nr; + if (vt->dec) + vt->dec(vt->context, element_at(info, ab, new_nr - 1), delta); ab->nr_entries = cpu_to_le32(new_nr); } @@ -573,16 +569,17 @@ static int grow(struct resize *resize) * These are the value_type functions for the btree elements, which point * to array blocks. */ -static void block_inc(void *context, const void *value) +static void block_inc(void *context, const void *value, unsigned count) { - __le64 block_le; + const __le64 *block_le = value; struct dm_array_info *info = context; + unsigned i; - memcpy(&block_le, value, sizeof(block_le)); - dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le)); + for (i = 0; i < count; i++, block_le++) + dm_tm_inc(info->btree_info.tm, le64_to_cpu(*block_le)); } -static void block_dec(void *context, const void *value) +static void __block_dec(void *context, const void *value) { int r; uint64_t b; @@ -621,6 +618,13 @@ static void block_dec(void *context, const void *value) dm_tm_dec(info->btree_info.tm, b); } +static void block_dec(void *context, const void *value, unsigned count) +{ + unsigned i; + for (i = 0; i < count; i++, value += sizeof(__le64)) + __block_dec(context, value); +} + static int block_equal(void *context, const void *value1, const void *value2) { return !memcmp(value1, value2, sizeof(__le64)); @@ -711,7 +715,7 @@ static int populate_ablock_with_values(struct dm_array_info *info, struct array_ return r; if (vt->inc) - vt->inc(vt->context, element_at(info, ab, i)); + vt->inc(vt->context, element_at(info, ab, i), 1); } ab->nr_entries = cpu_to_le32(new_nr); @@ -822,9 +826,9 @@ static int array_set_value(struct dm_array_info *info, dm_block_t root, old_value = element_at(info, ab, entry); if (vt->dec && (!vt->equal || !vt->equal(vt->context, old_value, value))) { - vt->dec(vt->context, old_value); + vt->dec(vt->context, old_value, 1); if (vt->inc) - vt->inc(vt->context, value); + vt->inc(vt->context, value, 1); } memcpy(old_value, value, info->value_type.size); diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index b1788853a355..893edb426dba 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -144,4 +144,17 @@ extern struct dm_block_validator btree_node_validator; extern void init_le64_type(struct dm_transaction_manager *tm, struct dm_btree_value_type *vt); +/* + * This returns a shadowed btree leaf that you may modify. In practise + * this means overwrites only, since an insert could cause a node to + * be split. Useful if you need access to the old value to calculate the + * new one. + * + * This only works with single level btrees. The given key must be present in + * the tree, otherwise -EINVAL will be returned. + */ +int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root, + uint64_t key, int *index, + dm_block_t *new_root, struct dm_block **leaf); + #endif /* DM_BTREE_INTERNAL_H */ diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index eff04fa23dfa..70532335c7c7 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -544,12 +544,13 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, if (info->value_type.dec) info->value_type.dec(info->value_type.context, - value_ptr(n, index)); + value_ptr(n, index), 1); delete_at(n, index); } - *new_root = shadow_root(&spine); + if (!r) + *new_root = shadow_root(&spine); exit_shadow_spine(&spine); return r; @@ -653,7 +654,7 @@ static int remove_one(struct dm_btree_info *info, dm_block_t root, if (k >= keys[last_level] && k < end_key) { if (info->value_type.dec) info->value_type.dec(info->value_type.context, - value_ptr(n, index)); + value_ptr(n, index), 1); delete_at(n, index); keys[last_level] = k + 1ull; diff --git a/drivers/md/persistent-data/dm-btree-spine.c b/drivers/md/persistent-data/dm-btree-spine.c index 2061ab865567..f5bd76ed8fe6 100644 --- a/drivers/md/persistent-data/dm-btree-spine.c +++ b/drivers/md/persistent-data/dm-btree-spine.c @@ -236,22 +236,14 @@ dm_block_t shadow_root(struct shadow_spine *s) return s->root; } -static void le64_inc(void *context, const void *value_le) +static void le64_inc(void *context, const void *value_le, unsigned count) { - struct dm_transaction_manager *tm = context; - __le64 v_le; - - memcpy(&v_le, value_le, sizeof(v_le)); - dm_tm_inc(tm, le64_to_cpu(v_le)); + dm_tm_with_runs(context, value_le, count, dm_tm_inc_range); } -static void le64_dec(void *context, const void *value_le) +static void le64_dec(void *context, const void *value_le, unsigned count) { - struct dm_transaction_manager *tm = context; - __le64 v_le; - - memcpy(&v_le, value_le, sizeof(v_le)); - dm_tm_dec(tm, le64_to_cpu(v_le)); + dm_tm_with_runs(context, value_le, count, dm_tm_dec_range); } static int le64_equal(void *context, const void *value1_le, const void *value2_le) diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index ef6e78d45d5b..0703ca7a7d9a 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -71,15 +71,13 @@ static int upper_bound(struct btree_node *n, uint64_t key) void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, struct dm_btree_value_type *vt) { - unsigned i; uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) - for (i = 0; i < nr_entries; i++) - dm_tm_inc(tm, value64(n, i)); + dm_tm_with_runs(tm, value_ptr(n, 0), nr_entries, dm_tm_inc_range); + else if (vt->inc) - for (i = 0; i < nr_entries; i++) - vt->inc(vt->context, value_ptr(n, i)); + vt->inc(vt->context, value_ptr(n, 0), nr_entries); } static int insert_at(size_t value_size, struct btree_node *node, unsigned index, @@ -318,13 +316,9 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root) goto out; } else { - if (info->value_type.dec) { - unsigned i; - - for (i = 0; i < f->nr_children; i++) - info->value_type.dec(info->value_type.context, - value_ptr(f->n, i)); - } + if (info->value_type.dec) + info->value_type.dec(info->value_type.context, + value_ptr(f->n, 0), f->nr_children); pop_frame(s); } } @@ -500,6 +494,122 @@ out: EXPORT_SYMBOL_GPL(dm_btree_lookup_next); +/*----------------------------------------------------------------*/ + +/* + * Copies entries from one region of a btree node to another. The regions + * must not overlap. + */ +static void copy_entries(struct btree_node *dest, unsigned dest_offset, + struct btree_node *src, unsigned src_offset, + unsigned count) +{ + size_t value_size = le32_to_cpu(dest->header.value_size); + memcpy(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t)); + memcpy(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size); +} + +/* + * Moves entries from one region fo a btree node to another. The regions + * may overlap. + */ +static void move_entries(struct btree_node *dest, unsigned dest_offset, + struct btree_node *src, unsigned src_offset, + unsigned count) +{ + size_t value_size = le32_to_cpu(dest->header.value_size); + memmove(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t)); + memmove(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size); +} + +/* + * Erases the first 'count' entries of a btree node, shifting following + * entries down into their place. + */ +static void shift_down(struct btree_node *n, unsigned count) +{ + move_entries(n, 0, n, count, le32_to_cpu(n->header.nr_entries) - count); +} + +/* + * Moves entries in a btree node up 'count' places, making space for + * new entries at the start of the node. + */ +static void shift_up(struct btree_node *n, unsigned count) +{ + move_entries(n, count, n, 0, le32_to_cpu(n->header.nr_entries)); +} + +/* + * Redistributes entries between two btree nodes to make them + * have similar numbers of entries. + */ +static void redistribute2(struct btree_node *left, struct btree_node *right) +{ + unsigned nr_left = le32_to_cpu(left->header.nr_entries); + unsigned nr_right = le32_to_cpu(right->header.nr_entries); + unsigned total = nr_left + nr_right; + unsigned target_left = total / 2; + unsigned target_right = total - target_left; + + if (nr_left < target_left) { + unsigned delta = target_left - nr_left; + copy_entries(left, nr_left, right, 0, delta); + shift_down(right, delta); + } else if (nr_left > target_left) { + unsigned delta = nr_left - target_left; + if (nr_right) + shift_up(right, delta); + copy_entries(right, 0, left, target_left, delta); + } + + left->header.nr_entries = cpu_to_le32(target_left); + right->header.nr_entries = cpu_to_le32(target_right); +} + +/* + * Redistribute entries between three nodes. Assumes the central + * node is empty. + */ +static void redistribute3(struct btree_node *left, struct btree_node *center, + struct btree_node *right) +{ + unsigned nr_left = le32_to_cpu(left->header.nr_entries); + unsigned nr_center = le32_to_cpu(center->header.nr_entries); + unsigned nr_right = le32_to_cpu(right->header.nr_entries); + unsigned total, target_left, target_center, target_right; + + BUG_ON(nr_center); + + total = nr_left + nr_right; + target_left = total / 3; + target_center = (total - target_left) / 2; + target_right = (total - target_left - target_center); + + if (nr_left < target_left) { + unsigned left_short = target_left - nr_left; + copy_entries(left, nr_left, right, 0, left_short); + copy_entries(center, 0, right, left_short, target_center); + shift_down(right, nr_right - target_right); + + } else if (nr_left < (target_left + target_center)) { + unsigned left_to_center = nr_left - target_left; + copy_entries(center, 0, left, target_left, left_to_center); + copy_entries(center, left_to_center, right, 0, target_center - left_to_center); + shift_down(right, nr_right - target_right); + + } else { + unsigned right_short = target_right - nr_right; + shift_up(right, right_short); + copy_entries(right, 0, left, nr_left - right_short, right_short); + copy_entries(center, 0, left, target_left, nr_left - target_left); + } + + left->header.nr_entries = cpu_to_le32(target_left); + center->header.nr_entries = cpu_to_le32(target_center); + right->header.nr_entries = cpu_to_le32(target_right); +} + /* * Splits a node by creating a sibling node and shifting half the nodes * contents across. Assumes there is a parent node, and it has room for @@ -530,12 +640,10 @@ EXPORT_SYMBOL_GPL(dm_btree_lookup_next); * * Where A* is a shadow of A. */ -static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index, - uint64_t key) +static int split_one_into_two(struct shadow_spine *s, unsigned parent_index, + struct dm_btree_value_type *vt, uint64_t key) { int r; - size_t size; - unsigned nr_left, nr_right; struct dm_block *left, *right, *parent; struct btree_node *ln, *rn, *pn; __le64 location; @@ -549,36 +657,18 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index, ln = dm_block_data(left); rn = dm_block_data(right); - nr_left = le32_to_cpu(ln->header.nr_entries) / 2; - nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left; - - ln->header.nr_entries = cpu_to_le32(nr_left); - rn->header.flags = ln->header.flags; - rn->header.nr_entries = cpu_to_le32(nr_right); + rn->header.nr_entries = cpu_to_le32(0); rn->header.max_entries = ln->header.max_entries; rn->header.value_size = ln->header.value_size; - memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0])); - - size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ? - sizeof(uint64_t) : s->info->value_type.size; - memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left), - size * nr_right); + redistribute2(ln, rn); - /* - * Patch up the parent - */ + /* patch up the parent */ parent = shadow_parent(s); - pn = dm_block_data(parent); - location = cpu_to_le64(dm_block_location(left)); - __dm_bless_for_disk(&location); - memcpy_disk(value_ptr(pn, parent_index), - &location, sizeof(__le64)); location = cpu_to_le64(dm_block_location(right)); __dm_bless_for_disk(&location); - r = insert_at(sizeof(__le64), pn, parent_index + 1, le64_to_cpu(rn->keys[0]), &location); if (r) { @@ -586,6 +676,7 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index, return r; } + /* patch up the spine */ if (key < le64_to_cpu(rn->keys[0])) { unlock_block(s->info, right); s->nodes[1] = left; @@ -598,6 +689,121 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index, } /* + * We often need to modify a sibling node. This function shadows a particular + * child of the given parent node. Making sure to update the parent to point + * to the new shadow. + */ +static int shadow_child(struct dm_btree_info *info, struct dm_btree_value_type *vt, + struct btree_node *parent, unsigned index, + struct dm_block **result) +{ + int r, inc; + dm_block_t root; + struct btree_node *node; + + root = value64(parent, index); + + r = dm_tm_shadow_block(info->tm, root, &btree_node_validator, + result, &inc); + if (r) + return r; + + node = dm_block_data(*result); + + if (inc) + inc_children(info->tm, node, vt); + + *((__le64 *) value_ptr(parent, index)) = + cpu_to_le64(dm_block_location(*result)); + + return 0; +} + +/* + * Splits two nodes into three. This is more work, but results in fuller + * nodes, so saves metadata space. + */ +static int split_two_into_three(struct shadow_spine *s, unsigned parent_index, + struct dm_btree_value_type *vt, uint64_t key) +{ + int r; + unsigned middle_index; + struct dm_block *left, *middle, *right, *parent; + struct btree_node *ln, *rn, *mn, *pn; + __le64 location; + + parent = shadow_parent(s); + pn = dm_block_data(parent); + + if (parent_index == 0) { + middle_index = 1; + left = shadow_current(s); + r = shadow_child(s->info, vt, pn, parent_index + 1, &right); + if (r) + return r; + } else { + middle_index = parent_index; + right = shadow_current(s); + r = shadow_child(s->info, vt, pn, parent_index - 1, &left); + if (r) + return r; + } + + r = new_block(s->info, &middle); + if (r < 0) + return r; + + ln = dm_block_data(left); + mn = dm_block_data(middle); + rn = dm_block_data(right); + + mn->header.nr_entries = cpu_to_le32(0); + mn->header.flags = ln->header.flags; + mn->header.max_entries = ln->header.max_entries; + mn->header.value_size = ln->header.value_size; + + redistribute3(ln, mn, rn); + + /* patch up the parent */ + pn->keys[middle_index] = rn->keys[0]; + location = cpu_to_le64(dm_block_location(middle)); + __dm_bless_for_disk(&location); + r = insert_at(sizeof(__le64), pn, middle_index, + le64_to_cpu(mn->keys[0]), &location); + if (r) { + if (shadow_current(s) != left) + unlock_block(s->info, left); + + unlock_block(s->info, middle); + + if (shadow_current(s) != right) + unlock_block(s->info, right); + + return r; + } + + + /* patch up the spine */ + if (key < le64_to_cpu(mn->keys[0])) { + unlock_block(s->info, middle); + unlock_block(s->info, right); + s->nodes[1] = left; + } else if (key < le64_to_cpu(rn->keys[0])) { + unlock_block(s->info, left); + unlock_block(s->info, right); + s->nodes[1] = middle; + } else { + unlock_block(s->info, left); + unlock_block(s->info, middle); + s->nodes[1] = right; + } + + return 0; +} + +/*----------------------------------------------------------------*/ + +/* * Splits a node by creating two new children beneath the given node. * * Before: @@ -690,6 +896,186 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) return 0; } +/*----------------------------------------------------------------*/ + +/* + * Redistributes a node's entries with its left sibling. + */ +static int rebalance_left(struct shadow_spine *s, struct dm_btree_value_type *vt, + unsigned parent_index, uint64_t key) +{ + int r; + struct dm_block *sib; + struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s)); + + r = shadow_child(s->info, vt, parent, parent_index - 1, &sib); + if (r) + return r; + + left = dm_block_data(sib); + right = dm_block_data(shadow_current(s)); + redistribute2(left, right); + *key_ptr(parent, parent_index) = right->keys[0]; + + if (key < le64_to_cpu(right->keys[0])) { + unlock_block(s->info, s->nodes[1]); + s->nodes[1] = sib; + } else { + unlock_block(s->info, sib); + } + + return 0; +} + +/* + * Redistributes a nodes entries with its right sibling. + */ +static int rebalance_right(struct shadow_spine *s, struct dm_btree_value_type *vt, + unsigned parent_index, uint64_t key) +{ + int r; + struct dm_block *sib; + struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s)); + + r = shadow_child(s->info, vt, parent, parent_index + 1, &sib); + if (r) + return r; + + left = dm_block_data(shadow_current(s)); + right = dm_block_data(sib); + redistribute2(left, right); + *key_ptr(parent, parent_index + 1) = right->keys[0]; + + if (key < le64_to_cpu(right->keys[0])) { + unlock_block(s->info, sib); + } else { + unlock_block(s->info, s->nodes[1]); + s->nodes[1] = sib; + } + + return 0; +} + +/* + * Returns the number of spare entries in a node. + */ +static int get_node_free_space(struct dm_btree_info *info, dm_block_t b, unsigned *space) +{ + int r; + unsigned nr_entries; + struct dm_block *block; + struct btree_node *node; + + r = bn_read_lock(info, b, &block); + if (r) + return r; + + node = dm_block_data(block); + nr_entries = le32_to_cpu(node->header.nr_entries); + *space = le32_to_cpu(node->header.max_entries) - nr_entries; + + unlock_block(info, block); + return 0; +} + +/* + * Make space in a node, either by moving some entries to a sibling, + * or creating a new sibling node. SPACE_THRESHOLD defines the minimum + * number of free entries that must be in the sibling to make the move + * worth while. If the siblings are shared (eg, part of a snapshot), + * then they are not touched, since this break sharing and so consume + * more space than we save. + */ +#define SPACE_THRESHOLD 8 +static int rebalance_or_split(struct shadow_spine *s, struct dm_btree_value_type *vt, + unsigned parent_index, uint64_t key) +{ + int r; + struct btree_node *parent = dm_block_data(shadow_parent(s)); + unsigned nr_parent = le32_to_cpu(parent->header.nr_entries); + unsigned free_space; + int left_shared = 0, right_shared = 0; + + /* Should we move entries to the left sibling? */ + if (parent_index > 0) { + dm_block_t left_b = value64(parent, parent_index - 1); + r = dm_tm_block_is_shared(s->info->tm, left_b, &left_shared); + if (r) + return r; + + if (!left_shared) { + r = get_node_free_space(s->info, left_b, &free_space); + if (r) + return r; + + if (free_space >= SPACE_THRESHOLD) + return rebalance_left(s, vt, parent_index, key); + } + } + + /* Should we move entries to the right sibling? */ + if (parent_index < (nr_parent - 1)) { + dm_block_t right_b = value64(parent, parent_index + 1); + r = dm_tm_block_is_shared(s->info->tm, right_b, &right_shared); + if (r) + return r; + + if (!right_shared) { + r = get_node_free_space(s->info, right_b, &free_space); + if (r) + return r; + + if (free_space >= SPACE_THRESHOLD) + return rebalance_right(s, vt, parent_index, key); + } + } + + /* + * We need to split the node, normally we split two nodes + * into three. But when inserting a sequence that is either + * monotonically increasing or decreasing it's better to split + * a single node into two. + */ + if (left_shared || right_shared || (nr_parent <= 2) || + (parent_index == 0) || (parent_index + 1 == nr_parent)) { + return split_one_into_two(s, parent_index, vt, key); + } else { + return split_two_into_three(s, parent_index, vt, key); + } +} + +/* + * Does the node contain a particular key? + */ +static bool contains_key(struct btree_node *node, uint64_t key) +{ + int i = lower_bound(node, key); + + if (i >= 0 && le64_to_cpu(node->keys[i]) == key) + return true; + + return false; +} + +/* + * In general we preemptively make sure there's a free entry in every + * node on the spine when doing an insert. But we can avoid that with + * leaf nodes if we know it's an overwrite. + */ +static bool has_space_for_insert(struct btree_node *node, uint64_t key) +{ + if (node->header.nr_entries == node->header.max_entries) { + if (le32_to_cpu(node->header.flags) & LEAF_NODE) { + /* we don't need space if it's an overwrite */ + return contains_key(node, key); + } + + return false; + } + + return true; +} + static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, struct dm_btree_value_type *vt, uint64_t key, unsigned *index) @@ -719,17 +1105,18 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, node = dm_block_data(shadow_current(s)); - if (node->header.nr_entries == node->header.max_entries) { + if (!has_space_for_insert(node, key)) { if (top) r = btree_split_beneath(s, key); else - r = btree_split_sibling(s, i, key); + r = rebalance_or_split(s, vt, i, key); if (r < 0) return r; - } - node = dm_block_data(shadow_current(s)); + /* making space can cause the current node to change */ + node = dm_block_data(shadow_current(s)); + } i = lower_bound(node, key); @@ -753,6 +1140,77 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, return 0; } +static int __btree_get_overwrite_leaf(struct shadow_spine *s, dm_block_t root, + uint64_t key, int *index) +{ + int r, i = -1; + struct btree_node *node; + + *index = 0; + for (;;) { + r = shadow_step(s, root, &s->info->value_type); + if (r < 0) + return r; + + node = dm_block_data(shadow_current(s)); + + /* + * We have to patch up the parent node, ugly, but I don't + * see a way to do this automatically as part of the spine + * op. + */ + if (shadow_has_parent(s) && i >= 0) { + __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); + + __dm_bless_for_disk(&location); + memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i), + &location, sizeof(__le64)); + } + + node = dm_block_data(shadow_current(s)); + i = lower_bound(node, key); + + BUG_ON(i < 0); + BUG_ON(i >= le32_to_cpu(node->header.nr_entries)); + + if (le32_to_cpu(node->header.flags) & LEAF_NODE) { + if (key != le64_to_cpu(node->keys[i])) + return -EINVAL; + break; + } + + root = value64(node, i); + } + + *index = i; + return 0; +} + +int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root, + uint64_t key, int *index, + dm_block_t *new_root, struct dm_block **leaf) +{ + int r; + struct shadow_spine spine; + + BUG_ON(info->levels > 1); + init_shadow_spine(&spine, info); + r = __btree_get_overwrite_leaf(&spine, root, key, index); + if (!r) { + *new_root = shadow_root(&spine); + *leaf = shadow_current(&spine); + + /* + * Decrement the count so exit_shadow_spine() doesn't + * unlock the leaf. + */ + spine.count--; + } + exit_shadow_spine(&spine); + + return r; +} + static bool need_insert(struct btree_node *node, uint64_t *keys, unsigned level, unsigned index) { @@ -829,7 +1287,7 @@ static int insert(struct dm_btree_info *info, dm_block_t root, value_ptr(n, index), value))) { info->value_type.dec(info->value_type.context, - value_ptr(n, index)); + value_ptr(n, index), 1); } memcpy_disk(value_ptr(n, index), value, info->value_type.size); diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h index 3dc5bb1a4748..d2ae5aa4d00b 100644 --- a/drivers/md/persistent-data/dm-btree.h +++ b/drivers/md/persistent-data/dm-btree.h @@ -51,21 +51,21 @@ struct dm_btree_value_type { */ /* - * The btree is making a duplicate of the value, for instance + * The btree is making a duplicate of a run of values, for instance * because previously-shared btree nodes have now diverged. * @value argument is the new copy that the copy function may modify. * (Probably it just wants to increment a reference count * somewhere.) This method is _not_ called for insertion of a new * value: It is assumed the ref count is already 1. */ - void (*inc)(void *context, const void *value); + void (*inc)(void *context, const void *value, unsigned count); /* - * This value is being deleted. The btree takes care of freeing + * These values are being deleted. The btree takes care of freeing * the memory pointed to by @value. Often the del function just - * needs to decrement a reference count somewhere. + * needs to decrement a reference counts somewhere. */ - void (*dec)(void *context, const void *value); + void (*dec)(void *context, const void *value, unsigned count); /* * A test for equality between two values. When a value is diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index a213bf11738f..4a6a2a9b4eb4 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -6,6 +6,8 @@ #include "dm-space-map-common.h" #include "dm-transaction-manager.h" +#include "dm-btree-internal.h" +#include "dm-persistent-data-internal.h" #include <linux/bitops.h> #include <linux/device-mapper.h> @@ -409,12 +411,13 @@ int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll, return r; } -static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, - int (*mutator)(void *context, uint32_t old, uint32_t *new), - void *context, enum allocation_event *ev) +/*----------------------------------------------------------------*/ + +int sm_ll_insert(struct ll_disk *ll, dm_block_t b, + uint32_t ref_count, int32_t *nr_allocations) { int r; - uint32_t bit, old, ref_count; + uint32_t bit, old; struct dm_block *nb; dm_block_t index = b; struct disk_index_entry ie_disk; @@ -433,10 +436,9 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, return r; } ie_disk.blocknr = cpu_to_le64(dm_block_location(nb)); - bm_le = dm_bitmap_data(nb); - old = sm_lookup_bitmap(bm_le, bit); + old = sm_lookup_bitmap(bm_le, bit); if (old > 2) { r = sm_ll_lookup_big_ref_count(ll, b, &old); if (r < 0) { @@ -445,7 +447,6 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, } } - r = mutator(context, old, &ref_count); if (r) { dm_tm_unlock(ll->tm, nb); return r; @@ -453,7 +454,6 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, if (ref_count <= 2) { sm_set_bitmap(bm_le, bit, ref_count); - dm_tm_unlock(ll->tm, nb); if (old > 2) { @@ -480,62 +480,459 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, } if (ref_count && !old) { - *ev = SM_ALLOC; + *nr_allocations = 1; ll->nr_allocated++; le32_add_cpu(&ie_disk.nr_free, -1); if (le32_to_cpu(ie_disk.none_free_before) == bit) ie_disk.none_free_before = cpu_to_le32(bit + 1); } else if (old && !ref_count) { - *ev = SM_FREE; + *nr_allocations = -1; ll->nr_allocated--; le32_add_cpu(&ie_disk.nr_free, 1); ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit)); } else - *ev = SM_NONE; + *nr_allocations = 0; return ll->save_ie(ll, index, &ie_disk); } -static int set_ref_count(void *context, uint32_t old, uint32_t *new) +/*----------------------------------------------------------------*/ + +/* + * Holds useful intermediate results for the range based inc and dec + * operations. + */ +struct inc_context { + struct disk_index_entry ie_disk; + struct dm_block *bitmap_block; + void *bitmap; + + struct dm_block *overflow_leaf; +}; + +static inline void init_inc_context(struct inc_context *ic) +{ + ic->bitmap_block = NULL; + ic->bitmap = NULL; + ic->overflow_leaf = NULL; +} + +static inline void exit_inc_context(struct ll_disk *ll, struct inc_context *ic) +{ + if (ic->bitmap_block) + dm_tm_unlock(ll->tm, ic->bitmap_block); + if (ic->overflow_leaf) + dm_tm_unlock(ll->tm, ic->overflow_leaf); +} + +static inline void reset_inc_context(struct ll_disk *ll, struct inc_context *ic) +{ + exit_inc_context(ll, ic); + init_inc_context(ic); +} + +/* + * Confirms a btree node contains a particular key at an index. + */ +static bool contains_key(struct btree_node *n, uint64_t key, int index) +{ + return index >= 0 && + index < le32_to_cpu(n->header.nr_entries) && + le64_to_cpu(n->keys[index]) == key; +} + +static int __sm_ll_inc_overflow(struct ll_disk *ll, dm_block_t b, struct inc_context *ic) { - *new = *((uint32_t *) context); + int r; + int index; + struct btree_node *n; + __le32 *v_ptr; + uint32_t rc; + + /* + * bitmap_block needs to be unlocked because getting the + * overflow_leaf may need to allocate, and thus use the space map. + */ + reset_inc_context(ll, ic); + + r = btree_get_overwrite_leaf(&ll->ref_count_info, ll->ref_count_root, + b, &index, &ll->ref_count_root, &ic->overflow_leaf); + if (r < 0) + return r; + + n = dm_block_data(ic->overflow_leaf); + + if (!contains_key(n, b, index)) { + DMERR("overflow btree is missing an entry"); + return -EINVAL; + } + + v_ptr = value_ptr(n, index); + rc = le32_to_cpu(*v_ptr) + 1; + *v_ptr = cpu_to_le32(rc); + return 0; } -int sm_ll_insert(struct ll_disk *ll, dm_block_t b, - uint32_t ref_count, enum allocation_event *ev) +static int sm_ll_inc_overflow(struct ll_disk *ll, dm_block_t b, struct inc_context *ic) +{ + int index; + struct btree_node *n; + __le32 *v_ptr; + uint32_t rc; + + /* + * Do we already have the correct overflow leaf? + */ + if (ic->overflow_leaf) { + n = dm_block_data(ic->overflow_leaf); + index = lower_bound(n, b); + if (contains_key(n, b, index)) { + v_ptr = value_ptr(n, index); + rc = le32_to_cpu(*v_ptr) + 1; + *v_ptr = cpu_to_le32(rc); + + return 0; + } + } + + return __sm_ll_inc_overflow(ll, b, ic); +} + +static inline int shadow_bitmap(struct ll_disk *ll, struct inc_context *ic) +{ + int r, inc; + r = dm_tm_shadow_block(ll->tm, le64_to_cpu(ic->ie_disk.blocknr), + &dm_sm_bitmap_validator, &ic->bitmap_block, &inc); + if (r < 0) { + DMERR("dm_tm_shadow_block() failed"); + return r; + } + ic->ie_disk.blocknr = cpu_to_le64(dm_block_location(ic->bitmap_block)); + ic->bitmap = dm_bitmap_data(ic->bitmap_block); + return 0; +} + +/* + * Once shadow_bitmap has been called, which always happens at the start of inc/dec, + * we can reopen the bitmap with a simple write lock, rather than re calling + * dm_tm_shadow_block(). + */ +static inline int ensure_bitmap(struct ll_disk *ll, struct inc_context *ic) +{ + if (!ic->bitmap_block) { + int r = dm_bm_write_lock(dm_tm_get_bm(ll->tm), le64_to_cpu(ic->ie_disk.blocknr), + &dm_sm_bitmap_validator, &ic->bitmap_block); + if (r) { + DMERR("unable to re-get write lock for bitmap"); + return r; + } + ic->bitmap = dm_bitmap_data(ic->bitmap_block); + } + + return 0; +} + +/* + * Loops round incrementing entries in a single bitmap. + */ +static inline int sm_ll_inc_bitmap(struct ll_disk *ll, dm_block_t b, + uint32_t bit, uint32_t bit_end, + int32_t *nr_allocations, dm_block_t *new_b, + struct inc_context *ic) +{ + int r; + __le32 le_rc; + uint32_t old; + + for (; bit != bit_end; bit++, b++) { + /* + * We only need to drop the bitmap if we need to find a new btree + * leaf for the overflow. So if it was dropped last iteration, + * we now re-get it. + */ + r = ensure_bitmap(ll, ic); + if (r) + return r; + + old = sm_lookup_bitmap(ic->bitmap, bit); + switch (old) { + case 0: + /* inc bitmap, adjust nr_allocated */ + sm_set_bitmap(ic->bitmap, bit, 1); + (*nr_allocations)++; + ll->nr_allocated++; + le32_add_cpu(&ic->ie_disk.nr_free, -1); + if (le32_to_cpu(ic->ie_disk.none_free_before) == bit) + ic->ie_disk.none_free_before = cpu_to_le32(bit + 1); + break; + + case 1: + /* inc bitmap */ + sm_set_bitmap(ic->bitmap, bit, 2); + break; + + case 2: + /* inc bitmap and insert into overflow */ + sm_set_bitmap(ic->bitmap, bit, 3); + reset_inc_context(ll, ic); + + le_rc = cpu_to_le32(3); + __dm_bless_for_disk(&le_rc); + r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root, + &b, &le_rc, &ll->ref_count_root); + if (r < 0) { + DMERR("ref count insert failed"); + return r; + } + break; + + default: + /* + * inc within the overflow tree only. + */ + r = sm_ll_inc_overflow(ll, b, ic); + if (r < 0) + return r; + } + } + + *new_b = b; + return 0; +} + +/* + * Finds a bitmap that contains entries in the block range, and increments + * them. + */ +static int __sm_ll_inc(struct ll_disk *ll, dm_block_t b, dm_block_t e, + int32_t *nr_allocations, dm_block_t *new_b) { - return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev); + int r; + struct inc_context ic; + uint32_t bit, bit_end; + dm_block_t index = b; + + init_inc_context(&ic); + + bit = do_div(index, ll->entries_per_block); + r = ll->load_ie(ll, index, &ic.ie_disk); + if (r < 0) + return r; + + r = shadow_bitmap(ll, &ic); + if (r) + return r; + + bit_end = min(bit + (e - b), (dm_block_t) ll->entries_per_block); + r = sm_ll_inc_bitmap(ll, b, bit, bit_end, nr_allocations, new_b, &ic); + + exit_inc_context(ll, &ic); + + if (r) + return r; + + return ll->save_ie(ll, index, &ic.ie_disk); } -static int inc_ref_count(void *context, uint32_t old, uint32_t *new) +int sm_ll_inc(struct ll_disk *ll, dm_block_t b, dm_block_t e, + int32_t *nr_allocations) { - *new = old + 1; + *nr_allocations = 0; + while (b != e) { + int r = __sm_ll_inc(ll, b, e, nr_allocations, &b); + if (r) + return r; + } + return 0; } -int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +/*----------------------------------------------------------------*/ + +static int __sm_ll_del_overflow(struct ll_disk *ll, dm_block_t b, + struct inc_context *ic) { - return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev); + reset_inc_context(ll, ic); + return dm_btree_remove(&ll->ref_count_info, ll->ref_count_root, + &b, &ll->ref_count_root); } -static int dec_ref_count(void *context, uint32_t old, uint32_t *new) +static int __sm_ll_dec_overflow(struct ll_disk *ll, dm_block_t b, + struct inc_context *ic, uint32_t *old_rc) { - if (!old) { - DMERR_LIMIT("unable to decrement a reference count below 0"); + int r; + int index = -1; + struct btree_node *n; + __le32 *v_ptr; + uint32_t rc; + + reset_inc_context(ll, ic); + r = btree_get_overwrite_leaf(&ll->ref_count_info, ll->ref_count_root, + b, &index, &ll->ref_count_root, &ic->overflow_leaf); + if (r < 0) + return r; + + n = dm_block_data(ic->overflow_leaf); + + if (!contains_key(n, b, index)) { + DMERR("overflow btree is missing an entry"); return -EINVAL; } - *new = old - 1; + v_ptr = value_ptr(n, index); + rc = le32_to_cpu(*v_ptr); + *old_rc = rc; + + if (rc == 3) { + return __sm_ll_del_overflow(ll, b, ic); + } else { + rc--; + *v_ptr = cpu_to_le32(rc); + return 0; + } +} + +static int sm_ll_dec_overflow(struct ll_disk *ll, dm_block_t b, + struct inc_context *ic, uint32_t *old_rc) +{ + /* + * Do we already have the correct overflow leaf? + */ + if (ic->overflow_leaf) { + int index; + struct btree_node *n; + __le32 *v_ptr; + uint32_t rc; + + n = dm_block_data(ic->overflow_leaf); + index = lower_bound(n, b); + if (contains_key(n, b, index)) { + v_ptr = value_ptr(n, index); + rc = le32_to_cpu(*v_ptr); + *old_rc = rc; + + if (rc > 3) { + rc--; + *v_ptr = cpu_to_le32(rc); + return 0; + } else { + return __sm_ll_del_overflow(ll, b, ic); + } + + } + } + + return __sm_ll_dec_overflow(ll, b, ic, old_rc); +} + +/* + * Loops round incrementing entries in a single bitmap. + */ +static inline int sm_ll_dec_bitmap(struct ll_disk *ll, dm_block_t b, + uint32_t bit, uint32_t bit_end, + struct inc_context *ic, + int32_t *nr_allocations, dm_block_t *new_b) +{ + int r; + uint32_t old; + + for (; bit != bit_end; bit++, b++) { + /* + * We only need to drop the bitmap if we need to find a new btree + * leaf for the overflow. So if it was dropped last iteration, + * we now re-get it. + */ + r = ensure_bitmap(ll, ic); + if (r) + return r; + + old = sm_lookup_bitmap(ic->bitmap, bit); + switch (old) { + case 0: + DMERR("unable to decrement block"); + return -EINVAL; + + case 1: + /* dec bitmap */ + sm_set_bitmap(ic->bitmap, bit, 0); + (*nr_allocations)--; + ll->nr_allocated--; + le32_add_cpu(&ic->ie_disk.nr_free, 1); + ic->ie_disk.none_free_before = + cpu_to_le32(min(le32_to_cpu(ic->ie_disk.none_free_before), bit)); + break; + + case 2: + /* dec bitmap and insert into overflow */ + sm_set_bitmap(ic->bitmap, bit, 1); + break; + + case 3: + r = sm_ll_dec_overflow(ll, b, ic, &old); + if (r < 0) + return r; + + if (old == 3) { + r = ensure_bitmap(ll, ic); + if (r) + return r; + + sm_set_bitmap(ic->bitmap, bit, 2); + } + break; + } + } + + *new_b = b; return 0; } -int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev) +static int __sm_ll_dec(struct ll_disk *ll, dm_block_t b, dm_block_t e, + int32_t *nr_allocations, dm_block_t *new_b) +{ + int r; + uint32_t bit, bit_end; + struct inc_context ic; + dm_block_t index = b; + + init_inc_context(&ic); + + bit = do_div(index, ll->entries_per_block); + r = ll->load_ie(ll, index, &ic.ie_disk); + if (r < 0) + return r; + + r = shadow_bitmap(ll, &ic); + if (r) + return r; + + bit_end = min(bit + (e - b), (dm_block_t) ll->entries_per_block); + r = sm_ll_dec_bitmap(ll, b, bit, bit_end, &ic, nr_allocations, new_b); + exit_inc_context(ll, &ic); + + if (r) + return r; + + return ll->save_ie(ll, index, &ic.ie_disk); +} + +int sm_ll_dec(struct ll_disk *ll, dm_block_t b, dm_block_t e, + int32_t *nr_allocations) { - return sm_ll_mutate(ll, b, dec_ref_count, NULL, ev); + *nr_allocations = 0; + while (b != e) { + int r = __sm_ll_dec(ll, b, e, nr_allocations, &b); + if (r) + return r; + } + + return 0; } +/*----------------------------------------------------------------*/ + int sm_ll_commit(struct ll_disk *ll) { int r = 0; @@ -687,28 +1084,92 @@ int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm, /*----------------------------------------------------------------*/ +static inline int ie_cache_writeback(struct ll_disk *ll, struct ie_cache *iec) +{ + iec->dirty = false; + __dm_bless_for_disk(iec->ie); + return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root, + &iec->index, &iec->ie, &ll->bitmap_root); +} + +static inline unsigned hash_index(dm_block_t index) +{ + return dm_hash_block(index, IE_CACHE_MASK); +} + static int disk_ll_load_ie(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *ie) { - return dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie); + int r; + unsigned h = hash_index(index); + struct ie_cache *iec = ll->ie_cache + h; + + if (iec->valid) { + if (iec->index == index) { + memcpy(ie, &iec->ie, sizeof(*ie)); + return 0; + } + + if (iec->dirty) { + r = ie_cache_writeback(ll, iec); + if (r) + return r; + } + } + + r = dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie); + if (!r) { + iec->valid = true; + iec->dirty = false; + iec->index = index; + memcpy(&iec->ie, ie, sizeof(*ie)); + } + + return r; } static int disk_ll_save_ie(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *ie) { - __dm_bless_for_disk(ie); - return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root, - &index, ie, &ll->bitmap_root); + int r; + unsigned h = hash_index(index); + struct ie_cache *iec = ll->ie_cache + h; + + ll->bitmap_index_changed = true; + if (iec->valid) { + if (iec->index == index) { + memcpy(&iec->ie, ie, sizeof(*ie)); + iec->dirty = true; + return 0; + } + + if (iec->dirty) { + r = ie_cache_writeback(ll, iec); + if (r) + return r; + } + } + + iec->valid = true; + iec->dirty = true; + iec->index = index; + memcpy(&iec->ie, ie, sizeof(*ie)); + return 0; } static int disk_ll_init_index(struct ll_disk *ll) { + unsigned i; + for (i = 0; i < IE_CACHE_SIZE; i++) { + struct ie_cache *iec = ll->ie_cache + i; + iec->valid = false; + iec->dirty = false; + } return dm_btree_empty(&ll->bitmap_info, &ll->bitmap_root); } static int disk_ll_open(struct ll_disk *ll) { - /* nothing to do */ return 0; } @@ -719,7 +1180,16 @@ static dm_block_t disk_ll_max_entries(struct ll_disk *ll) static int disk_ll_commit(struct ll_disk *ll) { - return 0; + int r = 0; + unsigned i; + + for (i = 0; i < IE_CACHE_SIZE; i++) { + struct ie_cache *iec = ll->ie_cache + i; + if (iec->valid && iec->dirty) + r = ie_cache_writeback(ll, iec); + } + + return r; } int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm) diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h index 87e17909ef52..706ceb85d680 100644 --- a/drivers/md/persistent-data/dm-space-map-common.h +++ b/drivers/md/persistent-data/dm-space-map-common.h @@ -54,6 +54,20 @@ typedef int (*open_index_fn)(struct ll_disk *ll); typedef dm_block_t (*max_index_entries_fn)(struct ll_disk *ll); typedef int (*commit_fn)(struct ll_disk *ll); +/* + * A lot of time can be wasted reading and writing the same + * index entry. So we cache a few entries. + */ +#define IE_CACHE_SIZE 64 +#define IE_CACHE_MASK (IE_CACHE_SIZE - 1) + +struct ie_cache { + bool valid; + bool dirty; + dm_block_t index; + struct disk_index_entry ie; +}; + struct ll_disk { struct dm_transaction_manager *tm; struct dm_btree_info bitmap_info; @@ -79,6 +93,8 @@ struct ll_disk { max_index_entries_fn max_entries; commit_fn commit; bool bitmap_index_changed:1; + + struct ie_cache ie_cache[IE_CACHE_SIZE]; }; struct disk_sm_root { @@ -96,12 +112,6 @@ struct disk_bitmap_header { __le64 blocknr; } __attribute__ ((packed, aligned(8))); -enum allocation_event { - SM_NONE, - SM_ALLOC, - SM_FREE, -}; - /*----------------------------------------------------------------*/ int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks); @@ -111,9 +121,15 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, dm_block_t end, dm_block_t *result); int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll, dm_block_t begin, dm_block_t end, dm_block_t *result); -int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev); -int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); -int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); + +/* + * The next three functions return (via nr_allocations) the net number of + * allocations that were made. This number may be negative if there were + * more frees than allocs. + */ +int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, int32_t *nr_allocations); +int sm_ll_inc(struct ll_disk *ll, dm_block_t b, dm_block_t e, int32_t *nr_allocations); +int sm_ll_dec(struct ll_disk *ll, dm_block_t b, dm_block_t e, int32_t *nr_allocations); int sm_ll_commit(struct ll_disk *ll); int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm); diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index 61f56909e00b..d0a8d5e73c28 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -87,76 +87,39 @@ static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count) { int r; - uint32_t old_count; - enum allocation_event ev; + int32_t nr_allocations; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - r = sm_ll_insert(&smd->ll, b, count, &ev); + r = sm_ll_insert(&smd->ll, b, count, &nr_allocations); if (!r) { - switch (ev) { - case SM_NONE: - break; - - case SM_ALLOC: - /* - * This _must_ be free in the prior transaction - * otherwise we've lost atomicity. - */ - smd->nr_allocated_this_transaction++; - break; - - case SM_FREE: - /* - * It's only free if it's also free in the last - * transaction. - */ - r = sm_ll_lookup(&smd->old_ll, b, &old_count); - if (r) - return r; - - if (!old_count) - smd->nr_allocated_this_transaction--; - break; - } + smd->nr_allocated_this_transaction += nr_allocations; } return r; } -static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b) +static int sm_disk_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) { int r; - enum allocation_event ev; + int32_t nr_allocations; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - r = sm_ll_inc(&smd->ll, b, &ev); - if (!r && (ev == SM_ALLOC)) - /* - * This _must_ be free in the prior transaction - * otherwise we've lost atomicity. - */ - smd->nr_allocated_this_transaction++; + r = sm_ll_inc(&smd->ll, b, e, &nr_allocations); + if (!r) + smd->nr_allocated_this_transaction += nr_allocations; return r; } -static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) +static int sm_disk_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) { int r; - uint32_t old_count; - enum allocation_event ev; + int32_t nr_allocations; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - r = sm_ll_dec(&smd->ll, b, &ev); - if (!r && (ev == SM_FREE)) { - /* - * It's only free if it's also free in the last - * transaction. - */ - r = sm_ll_lookup(&smd->old_ll, b, &old_count); - if (!r && !old_count) - smd->nr_allocated_this_transaction--; - } + r = sm_ll_dec(&smd->ll, b, e, &nr_allocations); + if (!r) + smd->nr_allocated_this_transaction += nr_allocations; return r; } @@ -164,21 +127,28 @@ static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b) static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) { int r; - enum allocation_event ev; + int32_t nr_allocations; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); /* * Any block we allocate has to be free in both the old and current ll. */ r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b); + if (r == -ENOSPC) { + /* + * There's no free block between smd->begin and the end of the metadata device. + * We search before smd->begin in case something has been freed. + */ + r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, 0, smd->begin, b); + } + if (r) return r; smd->begin = *b + 1; - r = sm_ll_inc(&smd->ll, *b, &ev); + r = sm_ll_inc(&smd->ll, *b, *b + 1, &nr_allocations); if (!r) { - BUG_ON(ev != SM_ALLOC); - smd->nr_allocated_this_transaction++; + smd->nr_allocated_this_transaction += nr_allocations; } return r; @@ -194,7 +164,6 @@ static int sm_disk_commit(struct dm_space_map *sm) return r; memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll)); - smd->begin = 0; smd->nr_allocated_this_transaction = 0; return 0; @@ -235,8 +204,8 @@ static struct dm_space_map ops = { .get_count = sm_disk_get_count, .count_is_more_than_one = sm_disk_count_is_more_than_one, .set_count = sm_disk_set_count, - .inc_block = sm_disk_inc_block, - .dec_block = sm_disk_dec_block, + .inc_blocks = sm_disk_inc_blocks, + .dec_blocks = sm_disk_dec_blocks, .new_block = sm_disk_new_block, .commit = sm_disk_commit, .root_size = sm_disk_root_size, diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 9e3c64ec2026..392ae26134a4 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -89,7 +89,8 @@ enum block_op_type { struct block_op { enum block_op_type type; - dm_block_t block; + dm_block_t b; + dm_block_t e; }; struct bop_ring_buffer { @@ -116,7 +117,7 @@ static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old) } static int brb_push(struct bop_ring_buffer *brb, - enum block_op_type type, dm_block_t b) + enum block_op_type type, dm_block_t b, dm_block_t e) { struct block_op *bop; unsigned next = brb_next(brb, brb->end); @@ -130,7 +131,8 @@ static int brb_push(struct bop_ring_buffer *brb, bop = brb->bops + brb->end; bop->type = type; - bop->block = b; + bop->b = b; + bop->e = e; brb->end = next; @@ -145,9 +147,7 @@ static int brb_peek(struct bop_ring_buffer *brb, struct block_op *result) return -ENODATA; bop = brb->bops + brb->begin; - result->type = bop->type; - result->block = bop->block; - + memcpy(result, bop, sizeof(*result)); return 0; } @@ -178,10 +178,9 @@ struct sm_metadata { struct threshold threshold; }; -static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) +static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b, dm_block_t e) { - int r = brb_push(&smm->uncommitted, type, b); - + int r = brb_push(&smm->uncommitted, type, b, e); if (r) { DMERR("too many recursive allocations"); return -ENOMEM; @@ -193,15 +192,15 @@ static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t static int commit_bop(struct sm_metadata *smm, struct block_op *op) { int r = 0; - enum allocation_event ev; + int32_t nr_allocations; switch (op->type) { case BOP_INC: - r = sm_ll_inc(&smm->ll, op->block, &ev); + r = sm_ll_inc(&smm->ll, op->b, op->e, &nr_allocations); break; case BOP_DEC: - r = sm_ll_dec(&smm->ll, op->block, &ev); + r = sm_ll_dec(&smm->ll, op->b, op->e, &nr_allocations); break; } @@ -314,7 +313,7 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, i = brb_next(&smm->uncommitted, i)) { struct block_op *op = smm->uncommitted.bops + i; - if (op->block != b) + if (b < op->b || b >= op->e) continue; switch (op->type) { @@ -355,7 +354,7 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, struct block_op *op = smm->uncommitted.bops + i; - if (op->block != b) + if (b < op->b || b >= op->e) continue; switch (op->type) { @@ -393,7 +392,7 @@ static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count) { int r, r2; - enum allocation_event ev; + int32_t nr_allocations; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); if (smm->recursion_count) { @@ -402,40 +401,42 @@ static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b, } in(smm); - r = sm_ll_insert(&smm->ll, b, count, &ev); + r = sm_ll_insert(&smm->ll, b, count, &nr_allocations); r2 = out(smm); return combine_errors(r, r2); } -static int sm_metadata_inc_block(struct dm_space_map *sm, dm_block_t b) +static int sm_metadata_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) { int r, r2 = 0; - enum allocation_event ev; + int32_t nr_allocations; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - if (recursing(smm)) - r = add_bop(smm, BOP_INC, b); - else { + if (recursing(smm)) { + r = add_bop(smm, BOP_INC, b, e); + if (r) + return r; + } else { in(smm); - r = sm_ll_inc(&smm->ll, b, &ev); + r = sm_ll_inc(&smm->ll, b, e, &nr_allocations); r2 = out(smm); } return combine_errors(r, r2); } -static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b) +static int sm_metadata_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) { int r, r2 = 0; - enum allocation_event ev; + int32_t nr_allocations; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); if (recursing(smm)) - r = add_bop(smm, BOP_DEC, b); + r = add_bop(smm, BOP_DEC, b, e); else { in(smm); - r = sm_ll_dec(&smm->ll, b, &ev); + r = sm_ll_dec(&smm->ll, b, e, &nr_allocations); r2 = out(smm); } @@ -445,23 +446,31 @@ static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b) static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b) { int r, r2 = 0; - enum allocation_event ev; + int32_t nr_allocations; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); /* * Any block we allocate has to be free in both the old and current ll. */ r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b); + if (r == -ENOSPC) { + /* + * There's no free block between smm->begin and the end of the metadata device. + * We search before smm->begin in case something has been freed. + */ + r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, 0, smm->begin, b); + } + if (r) return r; smm->begin = *b + 1; if (recursing(smm)) - r = add_bop(smm, BOP_INC, *b); + r = add_bop(smm, BOP_INC, *b, *b + 1); else { in(smm); - r = sm_ll_inc(&smm->ll, *b, &ev); + r = sm_ll_inc(&smm->ll, *b, *b + 1, &nr_allocations); r2 = out(smm); } @@ -503,7 +512,6 @@ static int sm_metadata_commit(struct dm_space_map *sm) return r; memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); - smm->begin = 0; smm->allocated_this_transaction = 0; return 0; @@ -556,8 +564,8 @@ static const struct dm_space_map ops = { .get_count = sm_metadata_get_count, .count_is_more_than_one = sm_metadata_count_is_more_than_one, .set_count = sm_metadata_set_count, - .inc_block = sm_metadata_inc_block, - .dec_block = sm_metadata_dec_block, + .inc_blocks = sm_metadata_inc_blocks, + .dec_blocks = sm_metadata_dec_blocks, .new_block = sm_metadata_new_block, .commit = sm_metadata_commit, .root_size = sm_metadata_root_size, @@ -641,18 +649,28 @@ static int sm_bootstrap_new_block(struct dm_space_map *sm, dm_block_t *b) return 0; } -static int sm_bootstrap_inc_block(struct dm_space_map *sm, dm_block_t b) +static int sm_bootstrap_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) { + int r; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - return add_bop(smm, BOP_INC, b); + r = add_bop(smm, BOP_INC, b, e); + if (r) + return r; + + return 0; } -static int sm_bootstrap_dec_block(struct dm_space_map *sm, dm_block_t b) +static int sm_bootstrap_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) { + int r; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - return add_bop(smm, BOP_DEC, b); + r = add_bop(smm, BOP_DEC, b, e); + if (r) + return r; + + return 0; } static int sm_bootstrap_commit(struct dm_space_map *sm) @@ -683,8 +701,8 @@ static const struct dm_space_map bootstrap_ops = { .get_count = sm_bootstrap_get_count, .count_is_more_than_one = sm_bootstrap_count_is_more_than_one, .set_count = sm_bootstrap_set_count, - .inc_block = sm_bootstrap_inc_block, - .dec_block = sm_bootstrap_dec_block, + .inc_blocks = sm_bootstrap_inc_blocks, + .dec_blocks = sm_bootstrap_dec_blocks, .new_block = sm_bootstrap_new_block, .commit = sm_bootstrap_commit, .root_size = sm_bootstrap_root_size, @@ -696,7 +714,7 @@ static const struct dm_space_map bootstrap_ops = { static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) { - int r, i; + int r; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); dm_block_t old_len = smm->ll.nr_blocks; @@ -718,9 +736,7 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks) * allocate any new blocks. */ do { - for (i = old_len; !r && i < smm->begin; i++) - r = add_bop(smm, BOP_INC, i); - + r = add_bop(smm, BOP_INC, old_len, smm->begin); if (r) goto out; @@ -767,7 +783,6 @@ int dm_sm_metadata_create(struct dm_space_map *sm, dm_block_t superblock) { int r; - dm_block_t i; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); smm->begin = superblock + 1; @@ -792,9 +807,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm, * Now we need to update the newly created data structures with the * allocated blocks that they were built from. */ - for (i = superblock; !r && i < smm->begin; i++) - r = add_bop(smm, BOP_INC, i); - + r = add_bop(smm, BOP_INC, superblock, smm->begin); if (r) return r; diff --git a/drivers/md/persistent-data/dm-space-map.h b/drivers/md/persistent-data/dm-space-map.h index 3e6d1153b7c4..a015cd11f6e9 100644 --- a/drivers/md/persistent-data/dm-space-map.h +++ b/drivers/md/persistent-data/dm-space-map.h @@ -46,8 +46,8 @@ struct dm_space_map { int (*commit)(struct dm_space_map *sm); - int (*inc_block)(struct dm_space_map *sm, dm_block_t b); - int (*dec_block)(struct dm_space_map *sm, dm_block_t b); + int (*inc_blocks)(struct dm_space_map *sm, dm_block_t b, dm_block_t e); + int (*dec_blocks)(struct dm_space_map *sm, dm_block_t b, dm_block_t e); /* * new_block will increment the returned block. @@ -117,14 +117,24 @@ static inline int dm_sm_commit(struct dm_space_map *sm) return sm->commit(sm); } +static inline int dm_sm_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) +{ + return sm->inc_blocks(sm, b, e); +} + static inline int dm_sm_inc_block(struct dm_space_map *sm, dm_block_t b) { - return sm->inc_block(sm, b); + return dm_sm_inc_blocks(sm, b, b + 1); +} + +static inline int dm_sm_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e) +{ + return sm->dec_blocks(sm, b, e); } static inline int dm_sm_dec_block(struct dm_space_map *sm, dm_block_t b) { - return sm->dec_block(sm, b); + return dm_sm_dec_blocks(sm, b, b + 1); } static inline int dm_sm_new_block(struct dm_space_map *sm, dm_block_t *b) diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c index abe2c5dd0993..16643fc974e8 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.c +++ b/drivers/md/persistent-data/dm-transaction-manager.c @@ -359,6 +359,17 @@ void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b) } EXPORT_SYMBOL_GPL(dm_tm_inc); +void dm_tm_inc_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e) +{ + /* + * The non-blocking clone doesn't support this. + */ + BUG_ON(tm->is_clone); + + dm_sm_inc_blocks(tm->sm, b, e); +} +EXPORT_SYMBOL_GPL(dm_tm_inc_range); + void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b) { /* @@ -370,6 +381,47 @@ void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b) } EXPORT_SYMBOL_GPL(dm_tm_dec); +void dm_tm_dec_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e) +{ + /* + * The non-blocking clone doesn't support this. + */ + BUG_ON(tm->is_clone); + + dm_sm_dec_blocks(tm->sm, b, e); +} +EXPORT_SYMBOL_GPL(dm_tm_dec_range); + +void dm_tm_with_runs(struct dm_transaction_manager *tm, + const __le64 *value_le, unsigned count, dm_tm_run_fn fn) +{ + uint64_t b, begin, end; + bool in_run = false; + unsigned i; + + for (i = 0; i < count; i++, value_le++) { + b = le64_to_cpu(*value_le); + + if (in_run) { + if (b == end) + end++; + else { + fn(tm, begin, end); + begin = b; + end = b + 1; + } + } else { + in_run = true; + begin = b; + end = b + 1; + } + } + + if (in_run) + fn(tm, begin, end); +} +EXPORT_SYMBOL_GPL(dm_tm_with_runs); + int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, uint32_t *result) { @@ -379,6 +431,15 @@ int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, return dm_sm_get_count(tm->sm, b, result); } +int dm_tm_block_is_shared(struct dm_transaction_manager *tm, dm_block_t b, + int *result) +{ + if (tm->is_clone) + return -EWOULDBLOCK; + + return dm_sm_count_is_more_than_one(tm->sm, b, result); +} + struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm) { return tm->bm; diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h index f3a18be68f30..906c02ed0365 100644 --- a/drivers/md/persistent-data/dm-transaction-manager.h +++ b/drivers/md/persistent-data/dm-transaction-manager.h @@ -100,11 +100,27 @@ void dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b); * Functions for altering the reference count of a block directly. */ void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b); - +void dm_tm_inc_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e); void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b); +void dm_tm_dec_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e); + +/* + * Builds up runs of adjacent blocks, and then calls the given fn + * (typically dm_tm_inc/dec). Very useful when you have to perform + * the same tm operation on all values in a btree leaf. + */ +typedef void (*dm_tm_run_fn)(struct dm_transaction_manager *, dm_block_t, dm_block_t); +void dm_tm_with_runs(struct dm_transaction_manager *tm, + const __le64 *value_le, unsigned count, dm_tm_run_fn fn); -int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, - uint32_t *result); +int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, uint32_t *result); + +/* + * Finds out if a given block is shared (ie. has a reference count higher + * than one). + */ +int dm_tm_block_is_shared(struct dm_transaction_manager *tm, dm_block_t b, + int *result); struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm); |