diff options
-rw-r--r-- | fs/btrfs/send.c | 85 |
1 files changed, 82 insertions, 3 deletions
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 55275ba90cb4..5a05beabf0c3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -137,6 +137,8 @@ struct send_ctx { */ struct inode *cur_inode; struct file_ra_state ra; + u64 page_cache_clear_start; + bool clean_page_cache; /* * We process inodes by their increasing order, so if before an @@ -5139,6 +5141,7 @@ static int send_extent_data(struct send_ctx *sctx, const u64 offset, const u64 len) { + const u64 end = offset + len; u64 read_size = max_send_read_size(sctx); u64 sent = 0; @@ -5157,6 +5160,28 @@ static int send_extent_data(struct send_ctx *sctx, } memset(&sctx->ra, 0, sizeof(struct file_ra_state)); file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping); + + /* + * It's very likely there are no pages from this inode in the page + * cache, so after reading extents and sending their data, we clean + * the page cache to avoid trashing the page cache (adding pressure + * to the page cache and forcing eviction of other data more useful + * for applications). + * + * We decide if we should clean the page cache simply by checking + * if the inode's mapping nrpages is 0 when we first open it, and + * not by using something like filemap_range_has_page() before + * reading an extent because when we ask the readahead code to + * read a given file range, it may (and almost always does) read + * pages from beyond that range (see the documentation for + * page_cache_sync_readahead()), so it would not be reliable, + * because after reading the first extent future calls to + * filemap_range_has_page() would return true because the readahead + * on the previous extent resulted in reading pages of the current + * extent as well. + */ + sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0); + sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE); } while (sent < len) { @@ -5168,6 +5193,37 @@ static int send_extent_data(struct send_ctx *sctx, return ret; sent += size; } + + if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) { + /* + * Always operate only on ranges that are a multiple of the page + * size. This is not only to prevent zeroing parts of a page in + * the case of subpage sector size, but also to guarantee we evict + * pages, as passing a range that is smaller than page size does + * not evict the respective page (only zeroes part of its content). + * + * Always start from the end offset of the last range cleared. + * This is because the readahead code may (and very often does) + * reads pages beyond the range we request for readahead. So if + * we have an extent layout like this: + * + * [ extent A ] [ extent B ] [ extent C ] + * + * When we ask page_cache_sync_readahead() to read extent A, it + * may also trigger reads for pages of extent B. If we are doing + * an incremental send and extent B has not changed between the + * parent and send snapshots, some or all of its pages may end + * up being read and placed in the page cache. So when truncating + * the page cache we always start from the end offset of the + * previously processed extent up to the end of the current + * extent. + */ + truncate_inode_pages_range(&sctx->cur_inode->i_data, + sctx->page_cache_clear_start, + end - 1); + sctx->page_cache_clear_start = end; + } + return 0; } @@ -6172,6 +6228,30 @@ out: return ret; } +static void close_current_inode(struct send_ctx *sctx) +{ + u64 i_size; + + if (sctx->cur_inode == NULL) + return; + + i_size = i_size_read(sctx->cur_inode); + + /* + * If we are doing an incremental send, we may have extents between the + * last processed extent and the i_size that have not been processed + * because they haven't changed but we may have read some of their pages + * through readahead, see the comments at send_extent_data(). + */ + if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size) + truncate_inode_pages_range(&sctx->cur_inode->i_data, + sctx->page_cache_clear_start, + round_up(i_size, PAGE_SIZE) - 1); + + iput(sctx->cur_inode); + sctx->cur_inode = NULL; +} + static int changed_inode(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { @@ -6182,8 +6262,7 @@ static int changed_inode(struct send_ctx *sctx, u64 left_gen = 0; u64 right_gen = 0; - iput(sctx->cur_inode); - sctx->cur_inode = NULL; + close_current_inode(sctx); sctx->cur_ino = key->objectid; sctx->cur_inode_new_gen = 0; @@ -7671,7 +7750,7 @@ out: name_cache_free(sctx); - iput(sctx->cur_inode); + close_current_inode(sctx); kfree(sctx); } |