1 files changed, 100 insertions, 31 deletions
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2171057a4477..dca6b12769ec 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -7,11 +7,33 @@
 #include <linux/refcount.h>
 #include <linux/fiemap.h>
 #include <linux/btrfs_tree.h>
+#include <linux/spinlock.h>
+#include <linux/atomic.h>
+#include <linux/rwsem.h>
+#include <linux/list.h>
+#include <linux/slab.h>
 #include "compression.h"
+#include "messages.h"
 #include "ulist.h"
 #include "misc.h"
 
+struct page;
+struct file;
+struct folio;
+struct inode;
+struct fiemap_extent_info;
+struct readahead_control;
+struct address_space;
+struct writeback_control;
+struct extent_io_tree;
+struct extent_map_tree;
+struct extent_state;
+struct btrfs_block_group;
+struct btrfs_fs_info;
+struct btrfs_inode;
+struct btrfs_root;
 struct btrfs_trans_handle;
+struct btrfs_tree_parent_check;
 
 enum {
 	EXTENT_BUFFER_UPTODATE,
@@ -28,7 +50,8 @@ enum {
 	EXTENT_BUFFER_IN_TREE,
 	/* write IO error */
 	EXTENT_BUFFER_WRITE_ERR,
-	EXTENT_BUFFER_NO_CHECK,
+	/* Indicate the extent buffer is written zeroed out (for zoned) */
+	EXTENT_BUFFER_ZONED_ZEROOUT,
 	/* Indicate that extent buffer pages a being read */
 	EXTENT_BUFFER_READING,
 };
@@ -43,10 +66,10 @@ enum {
 };
 
 /*
- * page->private values.  Every page that is controlled by the extent
- * map has page->private set to one.
+ * Folio private values.  Every page that is controlled by the extent map has
+ * folio private set to this value.
  */
-#define EXTENT_PAGE_PRIVATE 1
+#define EXTENT_FOLIO_PRIVATE			1
 
 /*
  * The extent buffer bitmap operations are done with byte granularity instead of
@@ -62,11 +85,6 @@ enum {
 #define BITMAP_LAST_BYTE_MASK(nbits) \
 	(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))
 
-struct btrfs_root;
-struct btrfs_inode;
-struct btrfs_fs_info;
-struct extent_io_tree;
-struct btrfs_tree_parent_check;
 
 int __init extent_buffer_init_cachep(void);
 void __cold extent_buffer_free_cachep(void);
@@ -74,19 +92,33 @@ void __cold extent_buffer_free_cachep(void);
 #define INLINE_EXTENT_BUFFER_PAGES     (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE)
 struct extent_buffer {
 	u64 start;
-	unsigned long len;
+	u32 len;
+	u32 folio_size;
 	unsigned long bflags;
 	struct btrfs_fs_info *fs_info;
+
+	/*
+	 * The address where the eb can be accessed without any cross-page handling.
+	 * This can be NULL if not possible.
+	 */
+	void *addr;
+
 	spinlock_t refs_lock;
 	atomic_t refs;
 	int read_mirror;
 	/* >= 0 if eb belongs to a log tree, -1 otherwise */
 	s8 log_index;
+	u8 folio_shift;
 	struct rcu_head rcu_head;
 
 	struct rw_semaphore lock;
 
-	struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
+	/*
+	 * Pointers to all the folios of the extent buffer.
+	 *
+	 * For now the folio is always order 0 (aka, a single page).
+	 */
+	struct folio *folios[INLINE_EXTENT_BUFFER_PAGES];
 #ifdef CONFIG_BTRFS_DEBUG
 	struct list_head leak_list;
 	pid_t lock_owner;
@@ -100,6 +132,13 @@ struct btrfs_eb_write_context {
 	struct btrfs_block_group *zoned_bg;
 };
 
+static inline unsigned long offset_in_eb_folio(const struct extent_buffer *eb,
+					       u64 start)
+{
+	ASSERT(eb->folio_size);
+	return start & (eb->folio_size - 1);
+}
+
 /*
  * Get the correct offset inside the page of extent buffer.
  *
@@ -108,29 +147,43 @@ struct btrfs_eb_write_context {
  *
  * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases.
  */
-static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb,
-					   unsigned long offset)
+static inline size_t get_eb_offset_in_folio(const struct extent_buffer *eb,
+					    unsigned long offset)
 {
 	/*
-	 * For sectorsize == PAGE_SIZE case, eb->start will always be aligned
-	 * to PAGE_SIZE, thus adding it won't cause any difference.
+	 * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case
+	 *    1.1) One large folio covering the whole eb
+	 *	   The eb->start is aligned to folio size, thus adding it
+	 *	   won't cause any difference.
+	 *    1.2) Several page sized folios
+	 *	   The eb->start is aligned to folio (page) size, thus
+	 *	   adding it won't cause any difference.
 	 *
-	 * For sectorsize < PAGE_SIZE, we must only read the data that belongs
-	 * to the eb, thus we have to take the eb->start into consideration.
+	 * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case
+	 *    In this case there would only be one page sized folio, and there
+	 *    may be several different extent buffers in the page/folio.
+	 *    We need to add eb->start to properly access the offset inside
+	 *    that eb.
 	 */
-	return offset_in_page(offset + eb->start);
+	return offset_in_folio(eb->folios[0], offset + eb->start);
 }
 
-static inline unsigned long get_eb_page_index(unsigned long offset)
+static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb,
+					       unsigned long offset)
 {
 	/*
-	 * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough.
+	 * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case
+	 *    1.1) One large folio covering the whole eb.
+	 *	   the folio_shift would be large enough to always make us
+	 *	   return 0 as index.
+	 *    1.2) Several page sized folios
+	 *         The folio_shift would be PAGE_SHIFT, giving us the correct
+	 *         index.
 	 *
-	 * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE,
-	 * and have ensured that all tree blocks are contained in one page,
-	 * thus we always get index == 0.
+	 * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case
+	 *    The folio would only be page sized, and always give us 0 as index.
 	 */
-	return offset >> PAGE_SHIFT;
+	return offset >> eb->folio_shift;
 }
 
 /*
@@ -178,22 +231,20 @@ static inline void extent_changeset_free(struct extent_changeset *changeset)
 	kfree(changeset);
 }
 
-struct extent_map_tree;
-
-int try_release_extent_mapping(struct page *page, gfp_t mask);
+bool try_release_extent_mapping(struct page *page, gfp_t mask);
 int try_release_extent_buffer(struct page *page);
 
 int btrfs_read_folio(struct file *file, struct folio *folio);
 void extent_write_locked_range(struct inode *inode, struct page *locked_page,
 			       u64 start, u64 end, struct writeback_control *wbc,
 			       bool pages_dirty);
-int extent_writepages(struct address_space *mapping,
-		      struct writeback_control *wbc);
+int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc);
 int btree_write_cache_pages(struct address_space *mapping,
 			    struct writeback_control *wbc);
-void extent_readahead(struct readahead_control *rac);
+void btrfs_readahead(struct readahead_control *rac);
 int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
 		  u64 start, u64 len);
+int set_folio_extent_mapped(struct folio *folio);
 int set_page_extent_mapped(struct page *page);
 void clear_page_extent_mapped(struct page *page);
 
@@ -230,6 +281,20 @@ static inline int num_extent_pages(const struct extent_buffer *eb)
 	return (eb->len >> PAGE_SHIFT) ?: 1;
 }
 
+/*
+ * This can only be determined at runtime by checking eb::folios[0].
+ *
+ * As we can have either one large folio covering the whole eb
+ * (either nodesize <= PAGE_SIZE, or high order folio), or multiple
+ * single-paged folios.
+ */
+static inline int num_extent_folios(const struct extent_buffer *eb)
+{
+	if (folio_order(eb->folios[0]))
+		return 1;
+	return num_extent_pages(eb);
+}
+
 static inline int extent_buffer_uptodate(const struct extent_buffer *eb)
 {
 	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -288,13 +353,17 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb);
 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
 void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
 				  struct page *locked_page,
+				  struct extent_state **cached,
 				  u32 bits_to_clear, unsigned long page_ops);
 int extent_invalidate_folio(struct extent_io_tree *tree,
 			    struct folio *folio, size_t offset);
 void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
 			      struct extent_buffer *buf);
 
-int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
+			   gfp_t extra_gfp);
+int btrfs_alloc_folio_array(unsigned int nr_folios, struct folio **folio_array,
+			    gfp_t extra_gfp);
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
 bool find_lock_delalloc_range(struct inode *inode,