aboutsummaryrefslogtreecommitdiff
path: root/fs/ext4/mballoc.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4/mballoc.c')
-rw-r--r--fs/ext4/mballoc.c775
1 files changed, 507 insertions, 268 deletions
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 5b2ae37a8b80..20f67a260df5 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -745,6 +745,8 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
grp = ext4_get_group_info(sb, e4b->bd_group);
+ if (!grp)
+ return NULL;
list_for_each(cur, &grp->bb_prealloc_list) {
ext4_group_t groupnr;
struct ext4_prealloc_space *pa;
@@ -1060,9 +1062,9 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
static noinline_for_stack
void ext4_mb_generate_buddy(struct super_block *sb,
- void *buddy, void *bitmap, ext4_group_t group)
+ void *buddy, void *bitmap, ext4_group_t group,
+ struct ext4_group_info *grp)
{
- struct ext4_group_info *grp = ext4_get_group_info(sb, group);
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
ext4_grpblk_t i = 0;
@@ -1168,10 +1170,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
if (groups_per_page > 1) {
i = sizeof(struct buffer_head *) * groups_per_page;
bh = kzalloc(i, gfp);
- if (bh == NULL) {
- err = -ENOMEM;
- goto out;
- }
+ if (bh == NULL)
+ return -ENOMEM;
} else
bh = &bhs;
@@ -1183,6 +1183,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
break;
grinfo = ext4_get_group_info(sb, group);
+ if (!grinfo)
+ continue;
/*
* If page is uptodate then we came here after online resize
* which added some new uninitialized group info structs, so
@@ -1248,6 +1250,10 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
group, page->index, i * blocksize);
trace_ext4_mb_buddy_bitmap_load(sb, group);
grinfo = ext4_get_group_info(sb, group);
+ if (!grinfo) {
+ err = -EFSCORRUPTED;
+ goto out;
+ }
grinfo->bb_fragments = 0;
memset(grinfo->bb_counters, 0,
sizeof(*grinfo->bb_counters) *
@@ -1258,7 +1264,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
ext4_lock_group(sb, group);
/* init the buddy */
memset(data, 0xff, blocksize);
- ext4_mb_generate_buddy(sb, data, incore, group);
+ ext4_mb_generate_buddy(sb, data, incore, group, grinfo);
ext4_unlock_group(sb, group);
incore = NULL;
} else {
@@ -1372,6 +1378,9 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
might_sleep();
mb_debug(sb, "init group %u\n", group);
this_grp = ext4_get_group_info(sb, group);
+ if (!this_grp)
+ return -EFSCORRUPTED;
+
/*
* This ensures that we don't reinit the buddy cache
* page which map to the group from which we are already
@@ -1446,6 +1455,8 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
blocks_per_page = PAGE_SIZE / sb->s_blocksize;
grp = ext4_get_group_info(sb, group);
+ if (!grp)
+ return -EFSCORRUPTED;
e4b->bd_blkbits = sb->s_blocksize_bits;
e4b->bd_info = grp;
@@ -1489,7 +1500,13 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
put_page(page);
page = find_or_create_page(inode->i_mapping, pnum, gfp);
if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
+ if (WARN_RATELIMIT(page->mapping != inode->i_mapping,
+ "ext4: bitmap's paging->mapping != inode->i_mapping\n")) {
+ /* should never happen */
+ unlock_page(page);
+ ret = -EINVAL;
+ goto err;
+ }
if (!PageUptodate(page)) {
ret = ext4_mb_init_cache(page, NULL, gfp);
if (ret) {
@@ -1525,7 +1542,13 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
put_page(page);
page = find_or_create_page(inode->i_mapping, pnum, gfp);
if (page) {
- BUG_ON(page->mapping != inode->i_mapping);
+ if (WARN_RATELIMIT(page->mapping != inode->i_mapping,
+ "ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) {
+ /* should never happen */
+ unlock_page(page);
+ ret = -EINVAL;
+ goto err;
+ }
if (!PageUptodate(page)) {
ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
gfp);
@@ -1557,8 +1580,7 @@ err:
put_page(page);
if (e4b->bd_bitmap_page)
put_page(e4b->bd_bitmap_page);
- if (e4b->bd_buddy_page)
- put_page(e4b->bd_buddy_page);
+
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
return ret;
@@ -1721,7 +1743,8 @@ static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
break;
order++;
- if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
+ buddy2 = mb_find_buddy(e4b, order, &max);
+ if (!buddy2) {
mb_clear_bits(buddy, first, last - first + 1);
e4b->bd_info->bb_counters[order - 1] += last - first + 1;
break;
@@ -2021,8 +2044,6 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_free_extent *bex = &ac->ac_b_ex;
struct ext4_free_extent *gex = &ac->ac_g_ex;
- struct ext4_free_extent ex;
- int max;
if (ac->ac_status == AC_STATUS_FOUND)
return;
@@ -2041,17 +2062,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
if (bex->fe_len < gex->fe_len)
return;
- if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
- && bex->fe_group == e4b->bd_group) {
- /* recheck chunk's availability - we don't know
- * when it was found (within this lock-unlock
- * period or not) */
- max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
- if (max >= gex->fe_len) {
- ext4_mb_use_best_found(ac, e4b);
- return;
- }
- }
+ if (finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
+ ext4_mb_use_best_found(ac, e4b);
}
/*
@@ -2062,6 +2074,20 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
* in the context. Later, the best found extent will be used, if
* mballoc can't find good enough extent.
*
+ * The algorithm used is roughly as follows:
+ *
+ * * If free extent found is exactly as big as goal, then
+ * stop the scan and use it immediately
+ *
+ * * If free extent found is smaller than goal, then keep retrying
+ * upto a max of sbi->s_mb_max_to_scan times (default 200). After
+ * that stop scanning and use whatever we have.
+ *
+ * * If free extent found is bigger than goal, then keep retrying
+ * upto a max of sbi->s_mb_min_to_scan times (default 10) before
+ * stopping the scan and using the extent.
+ *
+ *
* FIXME: real allocation policy is to be designed yet!
*/
static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
@@ -2124,7 +2150,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
}
static noinline_for_stack
-int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
+void ext4_mb_try_best_found(struct ext4_allocation_context *ac,
struct ext4_buddy *e4b)
{
struct ext4_free_extent ex = ac->ac_b_ex;
@@ -2135,7 +2161,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
BUG_ON(ex.fe_len <= 0);
err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
if (err)
- return err;
+ return;
ext4_lock_group(ac->ac_sb, group);
max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
@@ -2147,8 +2173,6 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
ext4_unlock_group(ac->ac_sb, group);
ext4_mb_unload_buddy(e4b);
-
- return 0;
}
static noinline_for_stack
@@ -2162,7 +2186,9 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
struct ext4_free_extent ex;
- if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
+ if (!grp)
+ return -EFSCORRUPTED;
+ if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY)))
return 0;
if (grp->bb_free == 0)
return 0;
@@ -2236,7 +2262,9 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
continue;
buddy = mb_find_buddy(e4b, i, &max);
- BUG_ON(buddy == NULL);
+ if (WARN_RATELIMIT(buddy == NULL,
+ "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i))
+ continue;
k = mb_find_next_zero_bit(buddy, max, 0);
if (k >= max) {
@@ -2386,7 +2414,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
BUG_ON(cr < 0 || cr >= 4);
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp))
return false;
free = grp->bb_free;
@@ -2455,6 +2483,8 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
ext4_grpblk_t free;
int ret = 0;
+ if (!grp)
+ return -EFSCORRUPTED;
if (sbi->s_mb_stats)
atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
if (should_lock) {
@@ -2535,7 +2565,7 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
* prefetch once, so we avoid getblk() call, which can
* be expensive.
*/
- if (!EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
+ if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
EXT4_MB_GRP_NEED_INIT(grp) &&
ext4_free_group_clusters(sb, gdp) > 0 &&
!(ext4_has_group_desc_csum(sb) &&
@@ -2569,17 +2599,17 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
unsigned int nr)
{
- while (nr-- > 0) {
- struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
- NULL);
- struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+ struct ext4_group_desc *gdp;
+ struct ext4_group_info *grp;
+ while (nr-- > 0) {
if (!group)
group = ext4_get_groups_count(sb);
group--;
+ gdp = ext4_get_group_desc(sb, group, NULL);
grp = ext4_get_group_info(sb, group);
- if (EXT4_MB_GRP_NEED_INIT(grp) &&
+ if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) &&
ext4_free_group_clusters(sb, gdp) > 0 &&
!(ext4_has_group_desc_csum(sb) &&
(gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
@@ -2838,6 +2868,8 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
sizeof(struct ext4_group_info);
grinfo = ext4_get_group_info(sb, group);
+ if (!grinfo)
+ return 0;
/* Load the group info in memory only if not already loaded. */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2848,7 +2880,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
buddy_loaded = 1;
}
- memcpy(&sg, ext4_get_group_info(sb, group), i);
+ memcpy(&sg, grinfo, i);
if (buddy_loaded)
ext4_mb_unload_buddy(&e4b);
@@ -3084,7 +3116,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
if (meta_group_info == NULL) {
ext4_msg(sb, KERN_ERR, "can't allocate mem "
"for a buddy group");
- goto exit_meta_group_info;
+ return -ENOMEM;
}
rcu_read_lock();
rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
@@ -3138,7 +3170,6 @@ exit_group_info:
group_info[idx] = NULL;
rcu_read_unlock();
}
-exit_meta_group_info:
return -ENOMEM;
} /* ext4_mb_add_groupinfo */
@@ -3210,8 +3241,12 @@ static int ext4_mb_init_backend(struct super_block *sb)
err_freebuddy:
cachep = get_groupinfo_cache(sb->s_blocksize_bits);
- while (i-- > 0)
- kmem_cache_free(cachep, ext4_get_group_info(sb, i));
+ while (i-- > 0) {
+ struct ext4_group_info *grp = ext4_get_group_info(sb, i);
+
+ if (grp)
+ kmem_cache_free(cachep, grp);
+ }
i = sbi->s_group_info_size;
rcu_read_lock();
group_info = rcu_dereference(sbi->s_group_info);
@@ -3419,7 +3454,6 @@ int ext4_mb_init(struct super_block *sb)
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
- sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC;
/*
* The default group preallocation is 512, which for 4k block
* sizes translates to 2 megabytes. However for bigalloc file
@@ -3525,6 +3559,8 @@ int ext4_mb_release(struct super_block *sb)
for (i = 0; i < ngroups; i++) {
cond_resched();
grinfo = ext4_get_group_info(sb, i);
+ if (!grinfo)
+ continue;
mb_group_bb_bitmap_free(grinfo);
ext4_lock_group(sb, i);
count = ext4_mb_cleanup_pa(grinfo);
@@ -3606,7 +3642,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
{
struct ext4_buddy e4b;
struct ext4_group_info *db;
- int err, count = 0, count2 = 0;
+ int err, count = 0;
mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
entry->efd_count, entry->efd_group, entry);
@@ -3622,7 +3658,6 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
db = e4b.bd_info;
/* there are blocks to put in buddy to make them really free */
count += entry->efd_count;
- count2++;
ext4_lock_group(sb, entry->efd_group);
/* Take it out of per group rb tree */
rb_erase(&entry->efd_node, &(db->bb_free_root));
@@ -3647,8 +3682,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
ext4_unlock_group(sb, entry->efd_group);
ext4_mb_unload_buddy(&e4b);
- mb_debug(sb, "freed %d blocks in %d structures\n", count,
- count2);
+ mb_debug(sb, "freed %d blocks in 1 structures\n", count);
}
/*
@@ -3757,9 +3791,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- bitmap_bh = NULL;
- goto out_err;
+ return PTR_ERR(bitmap_bh);
}
BUFFER_TRACE(bitmap_bh, "getting write access");
@@ -3822,7 +3854,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
}
len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
ext4_free_group_clusters_set(sb, gdp, len);
- ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
+ ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
@@ -3929,7 +3961,7 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
ext4_free_group_clusters_set(sb, gdp, clen);
- ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
+ ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
ext4_group_desc_csum_set(sb, group, gdp);
ext4_unlock_group(sb, group);
@@ -3985,6 +4017,197 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
}
/*
+ * This function returns the next element to look at during inode
+ * PA rbtree walk. We assume that we have held the inode PA rbtree lock
+ * (ei->i_prealloc_lock)
+ *
+ * new_start The start of the range we want to compare
+ * cur_start The existing start that we are comparing against
+ * node The node of the rb_tree
+ */
+static inline struct rb_node*
+ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node)
+{
+ if (new_start < cur_start)
+ return node->rb_left;
+ else
+ return node->rb_right;
+}
+
+static inline void
+ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac,
+ ext4_lblk_t start, ext4_lblk_t end)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+ struct ext4_prealloc_space *tmp_pa;
+ ext4_lblk_t tmp_pa_start, tmp_pa_end;
+ struct rb_node *iter;
+
+ read_lock(&ei->i_prealloc_lock);
+ for (iter = ei->i_prealloc_node.rb_node; iter;
+ iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) {
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ tmp_pa_start = tmp_pa->pa_lstart;
+ tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+
+ spin_lock(&tmp_pa->pa_lock);
+ if (tmp_pa->pa_deleted == 0)
+ BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start));
+ spin_unlock(&tmp_pa->pa_lock);
+ }
+ read_unlock(&ei->i_prealloc_lock);
+}
+
+/*
+ * Given an allocation context "ac" and a range "start", "end", check
+ * and adjust boundaries if the range overlaps with any of the existing
+ * preallocatoins stored in the corresponding inode of the allocation context.
+ *
+ * Parameters:
+ * ac allocation context
+ * start start of the new range
+ * end end of the new range
+ */
+static inline void
+ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac,
+ ext4_lblk_t *start, ext4_lblk_t *end)
+{
+ struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL;
+ struct rb_node *iter;
+ ext4_lblk_t new_start, new_end;
+ ext4_lblk_t tmp_pa_start, tmp_pa_end, left_pa_end = -1, right_pa_start = -1;
+
+ new_start = *start;
+ new_end = *end;
+
+ /*
+ * Adjust the normalized range so that it doesn't overlap with any
+ * existing preallocated blocks(PAs). Make sure to hold the rbtree lock
+ * so it doesn't change underneath us.
+ */
+ read_lock(&ei->i_prealloc_lock);
+
+ /* Step 1: find any one immediate neighboring PA of the normalized range */
+ for (iter = ei->i_prealloc_node.rb_node; iter;
+ iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
+ tmp_pa_start, iter)) {
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ tmp_pa_start = tmp_pa->pa_lstart;
+ tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+
+ /* PA must not overlap original request */
+ spin_lock(&tmp_pa->pa_lock);
+ if (tmp_pa->pa_deleted == 0)
+ BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end ||
+ ac->ac_o_ex.fe_logical < tmp_pa_start));
+ spin_unlock(&tmp_pa->pa_lock);
+ }
+
+ /*
+ * Step 2: check if the found PA is left or right neighbor and
+ * get the other neighbor
+ */
+ if (tmp_pa) {
+ if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) {
+ struct rb_node *tmp;
+
+ left_pa = tmp_pa;
+ tmp = rb_next(&left_pa->pa_node.inode_node);
+ if (tmp) {
+ right_pa = rb_entry(tmp,
+ struct ext4_prealloc_space,
+ pa_node.inode_node);
+ }
+ } else {
+ struct rb_node *tmp;
+
+ right_pa = tmp_pa;
+ tmp = rb_prev(&right_pa->pa_node.inode_node);
+ if (tmp) {
+ left_pa = rb_entry(tmp,
+ struct ext4_prealloc_space,
+ pa_node.inode_node);
+ }
+ }
+ }
+
+ /* Step 3: get the non deleted neighbors */
+ if (left_pa) {
+ for (iter = &left_pa->pa_node.inode_node;;
+ iter = rb_prev(iter)) {
+ if (!iter) {
+ left_pa = NULL;
+ break;
+ }
+
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ left_pa = tmp_pa;
+ spin_lock(&tmp_pa->pa_lock);
+ if (tmp_pa->pa_deleted == 0) {
+ spin_unlock(&tmp_pa->pa_lock);
+ break;
+ }
+ spin_unlock(&tmp_pa->pa_lock);
+ }
+ }
+
+ if (right_pa) {
+ for (iter = &right_pa->pa_node.inode_node;;
+ iter = rb_next(iter)) {
+ if (!iter) {
+ right_pa = NULL;
+ break;
+ }
+
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ right_pa = tmp_pa;
+ spin_lock(&tmp_pa->pa_lock);
+ if (tmp_pa->pa_deleted == 0) {
+ spin_unlock(&tmp_pa->pa_lock);
+ break;
+ }
+ spin_unlock(&tmp_pa->pa_lock);
+ }
+ }
+
+ if (left_pa) {
+ left_pa_end =
+ left_pa->pa_lstart + EXT4_C2B(sbi, left_pa->pa_len);
+ BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical);
+ }
+
+ if (right_pa) {
+ right_pa_start = right_pa->pa_lstart;
+ BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical);
+ }
+
+ /* Step 4: trim our normalized range to not overlap with the neighbors */
+ if (left_pa) {
+ if (left_pa_end > new_start)
+ new_start = left_pa_end;
+ }
+
+ if (right_pa) {
+ if (right_pa_start < new_end)
+ new_end = right_pa_start;
+ }
+ read_unlock(&ei->i_prealloc_lock);
+
+ /* XXX: extra loop to check we really don't overlap preallocations */
+ ext4_mb_pa_assert_overlap(ac, new_start, new_end);
+
+ *start = new_start;
+ *end = new_end;
+}
+
+/*
* Normalization means making request better in terms of
* size and alignment
*/
@@ -3993,13 +4216,12 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
struct ext4_allocation_request *ar)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_super_block *es = sbi->s_es;
int bsbits, max;
ext4_lblk_t end;
loff_t size, start_off;
loff_t orig_size __maybe_unused;
ext4_lblk_t start;
- struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
- struct ext4_prealloc_space *pa;
/* do normalize only data requests, metadata requests
do not need preallocation */
@@ -4068,7 +4290,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
size = 8 * 1024 * 1024;
} else {
start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
- size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
+ size = (loff_t) EXT4_C2B(sbi,
ac->ac_o_ex.fe_len) << bsbits;
}
size = size >> bsbits;
@@ -4100,61 +4322,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
end = start + size;
- /* check we don't cross already preallocated blocks */
- rcu_read_lock();
- list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
- ext4_lblk_t pa_end;
-
- if (pa->pa_deleted)
- continue;
- spin_lock(&pa->pa_lock);
- if (pa->pa_deleted) {
- spin_unlock(&pa->pa_lock);
- continue;
- }
-
- pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
- pa->pa_len);
-
- /* PA must not overlap original request */
- BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
- ac->ac_o_ex.fe_logical < pa->pa_lstart));
+ ext4_mb_pa_adjust_overlap(ac, &start, &end);
- /* skip PAs this normalized request doesn't overlap with */
- if (pa->pa_lstart >= end || pa_end <= start) {
- spin_unlock(&pa->pa_lock);
- continue;
- }
- BUG_ON(pa->pa_lstart <= start && pa_end >= end);
-
- /* adjust start or end to be adjacent to this pa */
- if (pa_end <= ac->ac_o_ex.fe_logical) {
- BUG_ON(pa_end < start);
- start = pa_end;
- } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
- BUG_ON(pa->pa_lstart > end);
- end = pa->pa_lstart;
- }
- spin_unlock(&pa->pa_lock);
- }
- rcu_read_unlock();
size = end - start;
- /* XXX: extra loop to check we really don't overlap preallocations */
- rcu_read_lock();
- list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
- ext4_lblk_t pa_end;
-
- spin_lock(&pa->pa_lock);
- if (pa->pa_deleted == 0) {
- pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
- pa->pa_len);
- BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
- }
- spin_unlock(&pa->pa_lock);
- }
- rcu_read_unlock();
-
/*
* In this function "start" and "size" are normalized for better
* alignment and length such that we could preallocate more blocks.
@@ -4165,7 +4336,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
* provide gurantee on number of contiguous blocks allocation since that
* depends upon free space left, etc).
* In case of inode pa, later we use the allocated blocks
- * [pa_start + fe_logical - pa_lstart, fe_len/size] from the preallocated
+ * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated
* range of goal/best blocks [start, size] to put it at the
* ac_o_ex.fe_logical extent of this inode.
* (See ext4_mb_use_inode_pa() for more details)
@@ -4188,18 +4359,21 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
/* define goal start in order to merge */
- if (ar->pright && (ar->lright == (start + size))) {
+ if (ar->pright && (ar->lright == (start + size)) &&
+ ar->pright >= size &&
+ ar->pright - size >= le32_to_cpu(es->s_first_data_block)) {
/* merge to the right */
ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
- &ac->ac_f_ex.fe_group,
- &ac->ac_f_ex.fe_start);
+ &ac->ac_g_ex.fe_group,
+ &ac->ac_g_ex.fe_start);
ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
}
- if (ar->pleft && (ar->lleft + 1 == start)) {
+ if (ar->pleft && (ar->lleft + 1 == start) &&
+ ar->pleft + 1 < ext4_blocks_count(es)) {
/* merge to the left */
ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
- &ac->ac_f_ex.fe_group,
- &ac->ac_f_ex.fe_start);
+ &ac->ac_g_ex.fe_group,
+ &ac->ac_g_ex.fe_start);
ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
}
@@ -4247,15 +4421,14 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
if (ac->ac_f_ex.fe_len == 0)
return;
err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
- if (err) {
+ if (WARN_RATELIMIT(err,
+ "ext4: mb_load_buddy failed (%d)", err))
/*
* This should never happen since we pin the
* pages in the ext4_allocation_context so
* ext4_mb_load_buddy() should never fail.
*/
- WARN(1, "mb_load_buddy failed (%d)", err);
return;
- }
ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
ac->ac_f_ex.fe_len);
@@ -4263,8 +4436,11 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
ext4_mb_unload_buddy(&e4b);
return;
}
- if (pa->pa_type == MB_INODE_PA)
+ if (pa->pa_type == MB_INODE_PA) {
+ spin_lock(&pa->pa_lock);
pa->pa_free += ac->ac_b_ex.fe_len;
+ spin_unlock(&pa->pa_lock);
+ }
}
/*
@@ -4292,6 +4468,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
BUG_ON(start < pa->pa_pstart);
BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
BUG_ON(pa->pa_free < len);
+ BUG_ON(ac->ac_b_ex.fe_len <= 0);
pa->pa_free -= len;
mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa);
@@ -4312,14 +4489,14 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
ac->ac_status = AC_STATUS_FOUND;
ac->ac_pa = pa;
- /* we don't correct pa_pstart or pa_plen here to avoid
+ /* we don't correct pa_pstart or pa_len here to avoid
* possible race when the group is being loaded concurrently
* instead we correct pa later, after blocks are marked
* in on-disk bitmap -- see ext4_mb_release_context()
* Other CPUs are prevented from allocating from this pa by lg_mutex
*/
mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
- pa->pa_lstart-len, len, pa);
+ pa->pa_lstart, len, pa);
}
/*
@@ -4361,7 +4538,9 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
int order, i;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_locality_group *lg;
- struct ext4_prealloc_space *pa, *cpa = NULL;
+ struct ext4_prealloc_space *tmp_pa, *cpa = NULL;
+ ext4_lblk_t tmp_pa_start, tmp_pa_end;
+ struct rb_node *iter;
ext4_fsblk_t goal_block;
/* only data can be preallocated */
@@ -4369,35 +4548,47 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
return false;
/* first, try per-file preallocation */
- rcu_read_lock();
- list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
+ read_lock(&ei->i_prealloc_lock);
+ for (iter = ei->i_prealloc_node.rb_node; iter;
+ iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical,
+ tmp_pa_start, iter)) {
+ tmp_pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
/* all fields in this condition don't change,
* so we can skip locking for them */
- if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
- ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
- EXT4_C2B(sbi, pa->pa_len)))
+ tmp_pa_start = tmp_pa->pa_lstart;
+ tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+
+ /* original request start doesn't lie in this PA */
+ if (ac->ac_o_ex.fe_logical < tmp_pa_start ||
+ ac->ac_o_ex.fe_logical >= tmp_pa_end)
continue;
/* non-extent files can't have physical blocks past 2^32 */
if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
- (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
- EXT4_MAX_BLOCK_FILE_PHYS))
- continue;
+ (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) >
+ EXT4_MAX_BLOCK_FILE_PHYS)) {
+ /*
+ * Since PAs don't overlap, we won't find any
+ * other PA to satisfy this.
+ */
+ break;
+ }
/* found preallocated blocks, use them */
- spin_lock(&pa->pa_lock);
- if (pa->pa_deleted == 0 && pa->pa_free) {
- atomic_inc(&pa->pa_count);
- ext4_mb_use_inode_pa(ac, pa);
- spin_unlock(&pa->pa_lock);
+ spin_lock(&tmp_pa->pa_lock);
+ if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free) {
+ atomic_inc(&tmp_pa->pa_count);
+ ext4_mb_use_inode_pa(ac, tmp_pa);
+ spin_unlock(&tmp_pa->pa_lock);
ac->ac_criteria = 10;
- rcu_read_unlock();
+ read_unlock(&ei->i_prealloc_lock);
return true;
}
- spin_unlock(&pa->pa_lock);
+ spin_unlock(&tmp_pa->pa_lock);
}
- rcu_read_unlock();
+ read_unlock(&ei->i_prealloc_lock);
/* can we use group allocation? */
if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
@@ -4419,16 +4610,16 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
*/
for (i = order; i < PREALLOC_TB_SIZE; i++) {
rcu_read_lock();
- list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
- pa_inode_list) {
- spin_lock(&pa->pa_lock);
- if (pa->pa_deleted == 0 &&
- pa->pa_free >= ac->ac_o_ex.fe_len) {
+ list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i],
+ pa_node.lg_list) {
+ spin_lock(&tmp_pa->pa_lock);
+ if (tmp_pa->pa_deleted == 0 &&
+ tmp_pa->pa_free >= ac->ac_o_ex.fe_len) {
cpa = ext4_mb_check_group_pa(goal_block,
- pa, cpa);
+ tmp_pa, cpa);
}
- spin_unlock(&pa->pa_lock);
+ spin_unlock(&tmp_pa->pa_lock);
}
rcu_read_unlock();
}
@@ -4454,6 +4645,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
struct ext4_free_data *entry;
grp = ext4_get_group_info(sb, group);
+ if (!grp)
+ return;
n = rb_first(&(grp->bb_free_root));
while (n) {
@@ -4481,6 +4674,9 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
int preallocated = 0;
int len;
+ if (!grp)
+ return;
+
/* all form of preallocation discards first load group,
* so the only competing code is preallocation use.
* we don't need any locking here
@@ -4525,16 +4721,22 @@ static void ext4_mb_mark_pa_deleted(struct super_block *sb,
}
}
-static void ext4_mb_pa_callback(struct rcu_head *head)
+static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa)
{
- struct ext4_prealloc_space *pa;
- pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
-
+ BUG_ON(!pa);
BUG_ON(atomic_read(&pa->pa_count));
BUG_ON(pa->pa_deleted == 0);
kmem_cache_free(ext4_pspace_cachep, pa);
}
+static void ext4_mb_pa_callback(struct rcu_head *head)
+{
+ struct ext4_prealloc_space *pa;
+
+ pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+ ext4_mb_pa_free(pa);
+}
+
/*
* drops a reference to preallocated space descriptor
* if this was the last reference and the space is consumed
@@ -4544,6 +4746,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
{
ext4_group_t grp;
ext4_fsblk_t grp_blk;
+ struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
/* in this short window concurrent discard can set pa_deleted */
spin_lock(&pa->pa_lock);
@@ -4588,11 +4791,42 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
list_del(&pa->pa_group_list);
ext4_unlock_group(sb, grp);
- spin_lock(pa->pa_obj_lock);
- list_del_rcu(&pa->pa_inode_list);
- spin_unlock(pa->pa_obj_lock);
+ if (pa->pa_type == MB_INODE_PA) {
+ write_lock(pa->pa_node_lock.inode_lock);
+ rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
+ write_unlock(pa->pa_node_lock.inode_lock);
+ ext4_mb_pa_free(pa);
+ } else {
+ spin_lock(pa->pa_node_lock.lg_lock);
+ list_del_rcu(&pa->pa_node.lg_list);
+ spin_unlock(pa->pa_node_lock.lg_lock);
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ }
+}
+
+static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new)
+{
+ struct rb_node **iter = &root->rb_node, *parent = NULL;
+ struct ext4_prealloc_space *iter_pa, *new_pa;
+ ext4_lblk_t iter_start, new_start;
+
+ while (*iter) {
+ iter_pa = rb_entry(*iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ new_pa = rb_entry(new, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ iter_start = iter_pa->pa_lstart;
+ new_start = new_pa->pa_lstart;
+
+ parent = *iter;
+ if (new_start < iter_start)
+ iter = &((*iter)->rb_left);
+ else
+ iter = &((*iter)->rb_right);
+ }
- call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ rb_link_node(new, parent, iter);
+ rb_insert_color(new, root);
}
/*
@@ -4616,10 +4850,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa = ac->ac_pa;
if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
- int winl;
- int wins;
- int win;
- int offs;
+ int new_bex_start;
+ int new_bex_end;
/* we can't allocate as much as normalizer wants.
* so, found space must get proper lstart
@@ -4627,38 +4859,47 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
- /* we're limited by original request in that
- * logical block must be covered any way
- * winl is window we can move our chunk within */
- winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
+ /*
+ * Use the below logic for adjusting best extent as it keeps
+ * fragmentation in check while ensuring logical range of best
+ * extent doesn't overflow out of goal extent:
+ *
+ * 1. Check if best ex can be kept at end of goal and still
+ * cover original start
+ * 2. Else, check if best ex can be kept at start of goal and
+ * still cover original start
+ * 3. Else, keep the best ex at start of original request.
+ */
+ new_bex_end = ac->ac_g_ex.fe_logical +
+ EXT4_C2B(sbi, ac->ac_g_ex.fe_len);
+ new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+ if (ac->ac_o_ex.fe_logical >= new_bex_start)
+ goto adjust_bex;
- /* also, we should cover whole original request */
- wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
+ new_bex_start = ac->ac_g_ex.fe_logical;
+ new_bex_end =
+ new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
+ if (ac->ac_o_ex.fe_logical < new_bex_end)
+ goto adjust_bex;
- /* the smallest one defines real window */
- win = min(winl, wins);
+ new_bex_start = ac->ac_o_ex.fe_logical;
+ new_bex_end =
+ new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
- offs = ac->ac_o_ex.fe_logical %
- EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
- if (offs && offs < win)
- win = offs;
+adjust_bex:
+ ac->ac_b_ex.fe_logical = new_bex_start;
- ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
- EXT4_NUM_B2C(sbi, win);
BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
+ BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical +
+ EXT4_C2B(sbi, ac->ac_g_ex.fe_len)));
}
- /* preallocation can change ac_b_ex, thus we store actually
- * allocated blocks for history */
- ac->ac_f_ex = ac->ac_b_ex;
-
pa->pa_lstart = ac->ac_b_ex.fe_logical;
pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
pa->pa_len = ac->ac_b_ex.fe_len;
pa->pa_free = pa->pa_len;
spin_lock_init(&pa->pa_lock);
- INIT_LIST_HEAD(&pa->pa_inode_list);
INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
pa->pa_type = MB_INODE_PA;
@@ -4667,20 +4908,22 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa->pa_len, pa->pa_lstart);
trace_ext4_mb_new_inode_pa(ac, pa);
- ext4_mb_use_inode_pa(ac, pa);
atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
+ ext4_mb_use_inode_pa(ac, pa);
ei = EXT4_I(ac->ac_inode);
grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+ if (!grp)
+ return;
- pa->pa_obj_lock = &ei->i_prealloc_lock;
+ pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock;
pa->pa_inode = ac->ac_inode;
list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
- spin_lock(pa->pa_obj_lock);
- list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
- spin_unlock(pa->pa_obj_lock);
+ write_lock(pa->pa_node_lock.inode_lock);
+ ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node);
+ write_unlock(pa->pa_node_lock.inode_lock);
atomic_inc(&ei->i_prealloc_active);
}
@@ -4703,16 +4946,12 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa = ac->ac_pa;
- /* preallocation can change ac_b_ex, thus we store actually
- * allocated blocks for history */
- ac->ac_f_ex = ac->ac_b_ex;
-
pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
pa->pa_lstart = pa->pa_pstart;
pa->pa_len = ac->ac_b_ex.fe_len;
pa->pa_free = pa->pa_len;
spin_lock_init(&pa->pa_lock);
- INIT_LIST_HEAD(&pa->pa_inode_list);
+ INIT_LIST_HEAD(&pa->pa_node.lg_list);
INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
pa->pa_type = MB_GROUP_PA;
@@ -4725,10 +4964,12 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
+ if (!grp)
+ return;
lg = ac->ac_lg;
BUG_ON(lg == NULL);
- pa->pa_obj_lock = &lg->lg_prealloc_lock;
+ pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock;
pa->pa_inode = NULL;
list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
@@ -4820,7 +5061,11 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
trace_ext4_mb_release_group_pa(sb, pa);
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
- BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
+ if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) {
+ ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu",
+ e4b->bd_group, group, pa->pa_pstart);
+ return 0;
+ }
mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
@@ -4846,9 +5091,12 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
struct ext4_prealloc_space *pa, *tmp;
struct list_head list;
struct ext4_buddy e4b;
+ struct ext4_inode_info *ei;
int err;
int free = 0;
+ if (!grp)
+ return 0;
mb_debug(sb, "discard preallocation for group %u\n", group);
if (list_empty(&grp->bb_prealloc_list))
goto out_dbg;
@@ -4904,17 +5152,26 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
/* remove from object (inode or locality group) */
- spin_lock(pa->pa_obj_lock);
- list_del_rcu(&pa->pa_inode_list);
- spin_unlock(pa->pa_obj_lock);
+ if (pa->pa_type == MB_GROUP_PA) {
+ spin_lock(pa->pa_node_lock.lg_lock);
+ list_del_rcu(&pa->pa_node.lg_list);
+ spin_unlock(pa->pa_node_lock.lg_lock);
+ } else {
+ write_lock(pa->pa_node_lock.inode_lock);
+ ei = EXT4_I(pa->pa_inode);
+ rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
+ write_unlock(pa->pa_node_lock.inode_lock);
+ }
+
+ list_del(&pa->u.pa_tmp_list);
- if (pa->pa_type == MB_GROUP_PA)
+ if (pa->pa_type == MB_GROUP_PA) {
ext4_mb_release_group_pa(&e4b, pa);
- else
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ } else {
ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
-
- list_del(&pa->u.pa_tmp_list);
- call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ ext4_mb_pa_free(pa);
+ }
}
ext4_unlock_group(sb, group);
@@ -4944,10 +5201,10 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
ext4_group_t group = 0;
struct list_head list;
struct ext4_buddy e4b;
+ struct rb_node *iter;
int err;
if (!S_ISREG(inode->i_mode)) {
- /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
return;
}
@@ -4966,17 +5223,19 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
repeat:
/* first, collect all pa's in the inode */
- spin_lock(&ei->i_prealloc_lock);
- while (!list_empty(&ei->i_prealloc_list) && needed) {
- pa = list_entry(ei->i_prealloc_list.prev,
- struct ext4_prealloc_space, pa_inode_list);
- BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
+ write_lock(&ei->i_prealloc_lock);
+ for (iter = rb_first(&ei->i_prealloc_node); iter && needed;
+ iter = rb_next(iter)) {
+ pa = rb_entry(iter, struct ext4_prealloc_space,
+ pa_node.inode_node);
+ BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock);
+
spin_lock(&pa->pa_lock);
if (atomic_read(&pa->pa_count)) {
/* this shouldn't happen often - nobody should
* use preallocation while we're discarding it */
spin_unlock(&pa->pa_lock);
- spin_unlock(&ei->i_prealloc_lock);
+ write_unlock(&ei->i_prealloc_lock);
ext4_msg(sb, KERN_ERR,
"uh-oh! used pa while discarding");
WARN_ON(1);
@@ -4987,7 +5246,7 @@ repeat:
if (pa->pa_deleted == 0) {
ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
- list_del_rcu(&pa->pa_inode_list);
+ rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
list_add(&pa->u.pa_tmp_list, &list);
needed--;
continue;
@@ -4995,7 +5254,7 @@ repeat:
/* someone is deleting pa right now */
spin_unlock(&pa->pa_lock);
- spin_unlock(&ei->i_prealloc_lock);
+ write_unlock(&ei->i_prealloc_lock);
/* we have to wait here because pa_deleted
* doesn't mean pa is already unlinked from
@@ -5012,7 +5271,7 @@ repeat:
schedule_timeout_uninterruptible(HZ);
goto repeat;
}
- spin_unlock(&ei->i_prealloc_lock);
+ write_unlock(&ei->i_prealloc_lock);
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
BUG_ON(pa->pa_type != MB_INODE_PA);
@@ -5044,7 +5303,7 @@ repeat:
put_bh(bitmap_bh);
list_del(&pa->u.pa_tmp_list);
- call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ ext4_mb_pa_free(pa);
}
}
@@ -5061,14 +5320,20 @@ static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
return 0;
}
-static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
+static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac)
{
struct ext4_prealloc_space *pa = ac->ac_pa;
BUG_ON(!pa);
ac->ac_pa = NULL;
WARN_ON(!atomic_dec_and_test(&pa->pa_count));
- kmem_cache_free(ext4_pspace_cachep, pa);
+ /*
+ * current function is only called due to an error or due to
+ * len of found blocks < len of requested blocks hence the PA has not
+ * been added to grp->bb_prealloc_list. So we don't need to lock it
+ */
+ pa->pa_deleted = 1;
+ ext4_mb_pa_free(pa);
}
#ifdef CONFIG_EXT4_DEBUG
@@ -5086,6 +5351,9 @@ static inline void ext4_mb_show_pa(struct super_block *sb)
struct ext4_prealloc_space *pa;
ext4_grpblk_t start;
struct list_head *cur;
+
+ if (!grp)
+ continue;
ext4_lock_group(sb, i);
list_for_each(cur, &grp->bb_prealloc_list) {
pa = list_entry(cur, struct ext4_prealloc_space,
@@ -5271,7 +5539,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
- pa_inode_list,
+ pa_node.lg_list,
lockdep_is_held(&lg->lg_prealloc_lock)) {
spin_lock(&pa->pa_lock);
if (atomic_read(&pa->pa_count)) {
@@ -5294,7 +5562,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
ext4_mb_mark_pa_deleted(sb, pa);
spin_unlock(&pa->pa_lock);
- list_del_rcu(&pa->pa_inode_list);
+ list_del_rcu(&pa->pa_node.lg_list);
list_add(&pa->u.pa_tmp_list, &discard_list);
total_entries--;
@@ -5355,7 +5623,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
/* Add the prealloc space to lg */
spin_lock(&lg->lg_prealloc_lock);
list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
- pa_inode_list,
+ pa_node.lg_list,
lockdep_is_held(&lg->lg_prealloc_lock)) {
spin_lock(&tmp_pa->pa_lock);
if (tmp_pa->pa_deleted) {
@@ -5364,8 +5632,8 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
}
if (!added && pa->pa_free < tmp_pa->pa_free) {
/* Add to the tail of the previous entry */
- list_add_tail_rcu(&pa->pa_inode_list,
- &tmp_pa->pa_inode_list);
+ list_add_tail_rcu(&pa->pa_node.lg_list,
+ &tmp_pa->pa_node.lg_list);
added = 1;
/*
* we want to count the total
@@ -5376,7 +5644,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
lg_prealloc_count++;
}
if (!added)
- list_add_tail_rcu(&pa->pa_inode_list,
+ list_add_tail_rcu(&pa->pa_node.lg_list,
&lg->lg_prealloc_list[order]);
spin_unlock(&lg->lg_prealloc_lock);
@@ -5390,29 +5658,10 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
}
/*
- * if per-inode prealloc list is too long, trim some PA
- */
-static void ext4_mb_trim_inode_pa(struct inode *inode)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- int count, delta;
-
- count = atomic_read(&ei->i_prealloc_active);
- delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1;
- if (count > sbi->s_mb_max_inode_prealloc + delta) {
- count -= sbi->s_mb_max_inode_prealloc;
- ext4_discard_preallocations(inode, count);
- }
-}
-
-/*
* release all resource we used in allocation
*/
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
- struct inode *inode = ac->ac_inode;
- struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *pa = ac->ac_pa;
if (pa) {
@@ -5432,23 +5681,13 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
* doesn't grow big.
*/
if (likely(pa->pa_free)) {
- spin_lock(pa->pa_obj_lock);
- list_del_rcu(&pa->pa_inode_list);
- spin_unlock(pa->pa_obj_lock);
+ spin_lock(pa->pa_node_lock.lg_lock);
+ list_del_rcu(&pa->pa_node.lg_list);
+ spin_unlock(pa->pa_node_lock.lg_lock);
ext4_mb_add_n_trim(ac);
}
}
- if (pa->pa_type == MB_INODE_PA) {
- /*
- * treat per-inode prealloc list as a lru list, then try
- * to trim the least recently used PA.
- */
- spin_lock(pa->pa_obj_lock);
- list_move(&pa->pa_inode_list, &ei->i_prealloc_list);
- spin_unlock(pa->pa_obj_lock);
- }
-
ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
if (ac->ac_bitmap_page)
@@ -5458,7 +5697,6 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
mutex_unlock(&ac->ac_lg->lg_mutex);
ext4_mb_collect_stats(ac);
- ext4_mb_trim_inode_pa(inode);
return 0;
}
@@ -5611,13 +5849,13 @@ repeat:
* So we have to free this pa here itself.
*/
if (*errp) {
- ext4_mb_pa_free(ac);
+ ext4_mb_pa_put_free(ac);
ext4_discard_allocated_blocks(ac);
goto errout;
}
if (ac->ac_status == AC_STATUS_FOUND &&
ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
- ext4_mb_pa_free(ac);
+ ext4_mb_pa_put_free(ac);
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -5636,20 +5874,19 @@ repeat:
* If block allocation fails then the pa allocated above
* needs to be freed here itself.
*/
- ext4_mb_pa_free(ac);
+ ext4_mb_pa_put_free(ac);
*errp = -ENOSPC;
}
-errout:
if (*errp) {
+errout:
ac->ac_b_ex.fe_len = 0;
ar->len = 0;
ext4_mb_show_ac(ac);
}
ext4_mb_release_context(ac);
+ kmem_cache_free(ext4_ac_cachep, ac);
out:
- if (ac)
- kmem_cache_free(ext4_ac_cachep, ac);
if (inquota && ar->len < inquota)
dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
if (!ar->len) {
@@ -5693,7 +5930,7 @@ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
kmem_cache_free(ext4_free_data_cachep, entry);
}
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
struct ext4_free_data *new_entry)
{
@@ -5736,7 +5973,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
EXT4_C2B(sbi, cluster),
"Block already on to-be-freed list");
kmem_cache_free(ext4_free_data_cachep, new_entry);
- return 0;
+ return;
}
}
@@ -5762,7 +5999,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
sbi->s_mb_free_pending += clusters;
spin_unlock(&sbi->s_md_lock);
- return 0;
}
/*
@@ -5797,9 +6033,6 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
return 0;
}
- ext4_get_group_no_and_offset(sb,
- max(ext4_group_first_block_no(sb, group), goal),
- NULL, &blkoff);
while (1) {
i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
blkoff);
@@ -5814,6 +6047,8 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
brelse(bitmap_bh);
if (i < max)
break;
+
+ blkoff = 0;
}
if (group >= ext4_get_groups_count(sb) || i >= max) {
@@ -5842,13 +6077,12 @@ static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
bitmap_bh = ext4_read_block_bitmap(sb, group);
if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
pr_warn("Failed to read block bitmap\n");
return;
}
gdp = ext4_get_group_desc(sb, group, &gdp_bh);
if (!gdp)
- return;
+ goto err_out;
for (i = 0; i < count; i++) {
if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
@@ -5857,15 +6091,17 @@ static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
mb_clear_bits(bitmap_bh->b_data, blkoff, count);
err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
if (err)
- return;
+ goto err_out;
ext4_free_group_clusters_set(
sb, gdp, ext4_free_group_clusters(sb, gdp) +
count - already_freed);
- ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
+ ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
ext4_group_desc_csum_set(sb, group, gdp);
ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
sync_dirty_buffer(bitmap_bh);
sync_dirty_buffer(gdp_bh);
+
+err_out:
brelse(bitmap_bh);
}
@@ -5885,6 +6121,7 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode,
struct buffer_head *bitmap_bh = NULL;
struct super_block *sb = inode->i_sb;
struct ext4_group_desc *gdp;
+ struct ext4_group_info *grp;
unsigned int overflow;
ext4_grpblk_t bit;
struct buffer_head *gd_bh;
@@ -5910,8 +6147,8 @@ do_more:
overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
- ext4_get_group_info(sb, block_group))))
+ grp = ext4_get_group_info(sb, block_group);
+ if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
return;
/*
@@ -6023,7 +6260,7 @@ do_more:
ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
ext4_free_group_clusters_set(sb, gdp, ret);
- ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
+ ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
@@ -6280,7 +6517,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
free_clusters_count = clusters_freed +
ext4_free_group_clusters(sb, desc);
ext4_free_group_clusters_set(sb, desc, free_clusters_count);
- ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
+ ext4_block_bitmap_csum_set(sb, desc, bitmap_bh);
ext4_group_desc_csum_set(sb, block_group, desc);
ext4_unlock_group(sb, block_group);
percpu_counter_add(&sbi->s_freeclusters_counter,
@@ -6513,6 +6750,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
for (group = first_group; group <= last_group; group++) {
grp = ext4_get_group_info(sb, group);
+ if (!grp)
+ continue;
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
ret = ext4_mb_init_group(sb, group, GFP_NOFS);