author      Filipe Manana <fdmanana@suse.com>        2023-05-29 16:16:57 +0100
committer   David Sterba <dsterba@suse.com>          2023-06-19 13:59:31 +0200
commit      315dd5cc75ddccb6ebd863366b7a7d0488057637 (patch)
tree        839efc79e3471de037ee252f14b1e4f5ca633f54 /fs/btrfs/delayed-ref.h
parent      31dd8c81ddfa142dab53a0837eed88e8febe7b1e (diff)
btrfs: reorder some members of struct btrfs_delayed_ref_head
Currently struct btrfs_delayed_ref_head has its 'bytenr' and 'href_node'
members in different cache lines (even on a release, non-debug kernel). This
is not optimal because when iterating the red-black tree of delayed ref heads
to insert a new delayed ref head (htree_insert()) we have to pull in 2 cache
lines for each delayed ref head we find along the path: one for the tree node
(struct rb_node) and another one for the 'bytenr' field. The same applies
when searching for an existing delayed ref head (find_ref_head()).
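To make the cost concrete, the lookup is a plain rbtree walk in which each
visited node is converted back to its delayed ref head (rb_entry(), i.e.
container_of()) and its 'bytenr' is compared against the search key. The
following is a simplified sketch of that pattern, assuming the kernel's
rbtree helpers and the struct definition are in scope; it is illustrative,
not the exact kernel code, and the function name is made up:

    /*
     * Simplified sketch of the rbtree walk done by htree_insert() and
     * find_ref_head() (illustrative, not the exact kernel code).
     */
    static struct btrfs_delayed_ref_head *ref_head_lookup_sketch(
                    struct rb_root_cached *root, u64 bytenr)
    {
            struct rb_node *node = root->rb_root.rb_node;

            while (node) {
                    struct btrfs_delayed_ref_head *entry;

                    /* Touches the cache line holding 'href_node'. */
                    entry = rb_entry(node, struct btrfs_delayed_ref_head,
                                     href_node);
                    /*
                     * Touches the cache line holding 'bytenr'. With the old
                     * layout this is a different cache line from the one
                     * holding 'href_node', so every node visited along the
                     * path costs two cache line loads instead of one.
                     */
                    if (bytenr < entry->bytenr)
                            node = node->rb_left;
                    else if (bytenr > entry->bytenr)
                            node = node->rb_right;
                    else
                            return entry;
            }
            return NULL;
    }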
On a release (non-debug) kernel, the structure also has two 4-byte holes,
which make it 8 bytes longer than necessary. Its current layout is the
following:
  struct btrfs_delayed_ref_head {
          u64                        bytenr;               /*     0     8 */
          u64                        num_bytes;            /*     8     8 */
          refcount_t                 refs;                 /*    16     4 */

          /* XXX 4 bytes hole, try to pack */

          struct mutex               mutex;                /*    24    32 */
          spinlock_t                 lock;                 /*    56     4 */

          /* XXX 4 bytes hole, try to pack */

          /* --- cacheline 1 boundary (64 bytes) --- */
          struct rb_root_cached      ref_tree;             /*    64    16 */
          struct list_head           ref_add_list;         /*    80    16 */
          struct rb_node             href_node __attribute__((__aligned__(8))); /*    96    24 */
          struct btrfs_delayed_extent_op * extent_op;      /*   120     8 */
          /* --- cacheline 2 boundary (128 bytes) --- */
          int                        total_ref_mod;        /*   128     4 */
          int                        ref_mod;              /*   132     4 */
          unsigned int               must_insert_reserved:1; /*  136: 0  4 */
          unsigned int               is_data:1;            /*   136: 1  4 */
          unsigned int               is_system:1;          /*   136: 2  4 */
          unsigned int               processing:1;         /*   136: 3  4 */

          /* size: 144, cachelines: 3, members: 15 */
          /* sum members: 128, holes: 2, sum holes: 8 */
          /* sum bitfield members: 4 bits (0 bytes) */
          /* padding: 4 */
          /* bit_padding: 28 bits */
          /* forced alignments: 1 */
          /* last cacheline: 16 bytes */
  } __attribute__((__aligned__(8)));
This change reorders the 'href_node' and 'refs' members so that we have
the 'href_node' in the same cache line as the 'bytenr' field, while also
eliminating the two holes and reducing the structure size from 144 bytes
down to 136 bytes, so we can now have 30 ref heads per 4K page (on x86_64)
instead of 28. The new structure layout after this change is now:
  struct btrfs_delayed_ref_head {
          u64                        bytenr;               /*     0     8 */
          u64                        num_bytes;            /*     8     8 */
          struct rb_node             href_node __attribute__((__aligned__(8))); /*    16    24 */
          struct mutex               mutex;                /*    40    32 */
          /* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */
          refcount_t                 refs;                 /*    72     4 */
          spinlock_t                 lock;                 /*    76     4 */
          struct rb_root_cached      ref_tree;             /*    80    16 */
          struct list_head           ref_add_list;         /*    96    16 */
          struct btrfs_delayed_extent_op * extent_op;      /*   112     8 */
          int                        total_ref_mod;        /*   120     4 */
          int                        ref_mod;              /*   124     4 */
          /* --- cacheline 2 boundary (128 bytes) --- */
          unsigned int               must_insert_reserved:1; /*  128: 0  4 */
          unsigned int               is_data:1;            /*   128: 1  4 */
          unsigned int               is_system:1;          /*   128: 2  4 */
          unsigned int               processing:1;         /*   128: 3  4 */

          /* size: 136, cachelines: 3, members: 15 */
          /* padding: 4 */
          /* bit_padding: 28 bits */
          /* forced alignments: 1 */
          /* last cacheline: 8 bytes */
  } __attribute__((__aligned__(8)));
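Besides inspecting the pahole output, a layout change like this can also be
sanity checked at build time with a few assertions on offsets and sizes. A
minimal sketch, not part of the actual patch, assuming a release (non-debug)
x86_64 config so the numbers match the pahole output above, and that
<linux/build_bug.h> and <linux/stddef.h> are in scope:

    /* Illustrative build-time layout checks (not part of the patch). */
    static_assert(offsetof(struct btrfs_delayed_ref_head, bytenr) == 0);
    /* 'href_node' (24 bytes at offset 16) fits in the first cache line. */
    static_assert(offsetof(struct btrfs_delayed_ref_head, href_node) == 16);
    static_assert(offsetof(struct btrfs_delayed_ref_head, href_node) +
                  sizeof(struct rb_node) <= 64);
    /* The two holes are gone: 144 -> 136 bytes. */
    static_assert(sizeof(struct btrfs_delayed_ref_head) == 136);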
Running the following fs_mark test shows a significant improvement: the total
files/sec goes from 664165.5 to 840207.5, an increase of about 26.5%.
$ cat test.sh
#!/bin/bash
# 15G null block device
DEV=/dev/nullb0
MNT=/mnt/nullb0
FILES=100000
THREADS=$(nproc --all)
FILE_SIZE=0
echo "performance" | \
tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
mkfs.btrfs -f $DEV
mount -o ssd $DEV $MNT
OPTS="-S 0 -L 5 -n $FILES -s $FILE_SIZE -t $THREADS -k"
for ((i = 1; i <= $THREADS; i++)); do
OPTS="$OPTS -d $MNT/d$i"
done
fs_mark $OPTS
umount $MNT
Before this change:
FSUse%        Count         Size    Files/sec     App Overhead
    10      1200000            0     112631.3         11928055
    16      2400000            0     189943.8         12140777
    23      3600000            0     150719.2         13178480
    50      4800000            0      99137.3         12504293
    53      6000000            0     111733.9         12670836

Total files/sec: 664165.5
After this change:
FSUse%        Count         Size    Files/sec     App Overhead
    10      1200000            0     148589.5         11565889
    16      2400000            0     227743.8         11561596
    23      3600000            0     191590.5         12550755
    30      4800000            0     179812.3         12629610
    53      6000000            0      92471.4         12352383

Total files/sec: 840207.5
Measuring the execution times of htree_insert(), in nanoseconds, during
those fs_mark runs:
Before this change:
Range: 0.000 - 940647.000; Mean: 619.733; Median: 548.000; Stddev: 1834.231
Percentiles: 90th: 980.000; 95th: 1208.000; 99th: 2090.000
0.000 - 6.384: 257 |
6.384 - 26.259: 977 |
26.259 - 99.635: 4963 |
99.635 - 370.526: 136800 #############
370.526 - 1370.603: 566110 #####################################################
1370.603 - 5062.704: 24945 ##
5062.704 - 18693.248: 944 |
18693.248 - 69014.670: 211 |
69014.670 - 254791.959: 30 |
254791.959 - 940647.000: 4 |
After this change:
Range: 0.000 - 299200.000; Mean: 587.754; Median: 542.000; Stddev: 1030.422
Percentiles: 90th: 918.000; 95th: 1113.000; 99th: 1987.000
0.000 - 5.585: 163 |
5.585 - 20.678: 452 |
20.678 - 70.369: 1806 |
70.369 - 233.965: 26268 ####
233.965 - 772.564: 333519 #####################################################
772.564 - 2545.771: 91820 ###############
2545.771 - 8383.615: 2238 |
8383.615 - 27603.280: 170 |
27603.280 - 90879.297: 68 |
90879.297 - 299200.000: 12 |
The mean, the percentiles and the maximum times are all better, and the
standard deviation is lower.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
Diffstat (limited to 'fs/btrfs/delayed-ref.h')
-rw-r--r--   fs/btrfs/delayed-ref.h | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index b54261fe509b..9b28e800a604 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -70,20 +70,26 @@ struct btrfs_delayed_extent_op {
 struct btrfs_delayed_ref_head {
 	u64 bytenr;
 	u64 num_bytes;
-	refcount_t refs;
+	/*
+	 * For insertion into struct btrfs_delayed_ref_root::href_root.
+	 * Keep it in the same cache line as 'bytenr' for more efficient
+	 * searches in the rbtree.
+	 */
+	struct rb_node href_node;
 	/*
 	 * the mutex is held while running the refs, and it is also
 	 * held when checking the sum of reference modifications.
 	 */
 	struct mutex mutex;
 
+	refcount_t refs;
+
+	/* Protects 'ref_tree' and 'ref_add_list'. */
 	spinlock_t lock;
 	struct rb_root_cached ref_tree;
 	/* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */
 	struct list_head ref_add_list;
 
-	struct rb_node href_node;
-
 	struct btrfs_delayed_extent_op *extent_op;
 
 	/*