From 637cd8c312d8caf234821fd37238b8f956d9ab13 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 31 Aug 2017 23:27:11 -0700 Subject: bpf: Add lru_hash_lookup performance test Create a new case to test the LRU lookup performance. At the beginning, the LRU map is fully loaded (i.e. the number of keys is equal to map->max_entries). The lookup is done through key 0 to num_map_entries and then repeats from 0 again. This patch also creates an anonymous struct to properly name the test params in stress_lru_hmap_alloc() in map_perf_test_kern.c. Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- samples/bpf/map_perf_test_kern.c | 44 +++++++++++++++++++---- samples/bpf/map_perf_test_user.c | 77 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 112 insertions(+), 9 deletions(-) diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c index ca3b22ed577a..098c857f1eda 100644 --- a/samples/bpf/map_perf_test_kern.c +++ b/samples/bpf/map_perf_test_kern.c @@ -88,6 +88,13 @@ struct bpf_map_def SEC("maps") array_map = { .max_entries = MAX_ENTRIES, }; +struct bpf_map_def SEC("maps") lru_hash_lookup_map = { + .type = BPF_MAP_TYPE_LRU_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(long), + .max_entries = MAX_ENTRIES, +}; + SEC("kprobe/sys_getuid") int stress_hmap(struct pt_regs *ctx) { @@ -148,12 +155,23 @@ int stress_percpu_hmap_alloc(struct pt_regs *ctx) SEC("kprobe/sys_connect") int stress_lru_hmap_alloc(struct pt_regs *ctx) { + char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%dn"; + union { + u16 dst6[8]; + struct { + u16 magic0; + u16 magic1; + u16 tcase; + u16 unused16; + u32 unused32; + u32 key; + }; + } test_params; struct sockaddr_in6 *in6; - u16 test_case, dst6[8]; + u16 test_case; int addrlen, ret; - char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%d\n"; long val = 1; - u32 key = bpf_get_prandom_u32(); + u32 key = 0; in6 = (struct sockaddr_in6 *)PT_REGS_PARM2(ctx); addrlen = (int)PT_REGS_PARM3(ctx); @@ -161,14 +179,18 @@ int stress_lru_hmap_alloc(struct pt_regs *ctx) if (addrlen != sizeof(*in6)) return 0; - ret = bpf_probe_read(dst6, sizeof(dst6), &in6->sin6_addr); + ret = bpf_probe_read(test_params.dst6, sizeof(test_params.dst6), + &in6->sin6_addr); if (ret) goto done; - if (dst6[0] != 0xdead || dst6[1] != 0xbeef) + if (test_params.magic0 != 0xdead || + test_params.magic1 != 0xbeef) return 0; - test_case = dst6[7]; + test_case = test_params.tcase; + if (test_case != 3) + key = bpf_get_prandom_u32(); if (test_case == 0) { ret = bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY); @@ -188,6 +210,16 @@ int stress_lru_hmap_alloc(struct pt_regs *ctx) ret = bpf_map_update_elem(nolocal_lru_map, &key, &val, BPF_ANY); + } else if (test_case == 3) { + u32 i; + + key = test_params.key; + +#pragma clang loop unroll(full) + for (i = 0; i < 32; i++) { + bpf_map_lookup_elem(&lru_hash_lookup_map, &key); + key++; + } } else { ret = -EINVAL; } diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index bccbf8478e43..f388254896f6 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -46,6 +46,7 @@ enum test_type { HASH_LOOKUP, ARRAY_LOOKUP, INNER_LRU_HASH_PREALLOC, + LRU_HASH_LOOKUP, NR_TESTS, }; @@ -60,6 +61,7 @@ const char *test_map_names[NR_TESTS] = { [HASH_LOOKUP] = "hash_map", [ARRAY_LOOKUP] = "array_map", [INNER_LRU_HASH_PREALLOC] = "inner_lru_hash_map", + [LRU_HASH_LOOKUP] = "lru_hash_lookup_map", }; static int test_flags = ~0; @@ -67,6 +69,8 @@ static uint32_t num_map_entries; static uint32_t inner_lru_hash_size; static int inner_lru_hash_idx = -1; static int array_of_lru_hashs_idx = -1; +static int lru_hash_lookup_idx = -1; +static int lru_hash_lookup_test_entries = 32; static uint32_t max_cnt = 1000000; static int check_test_flags(enum test_type t) @@ -86,6 +90,32 @@ static void test_hash_prealloc(int cpu) cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time)); } +static int pre_test_lru_hash_lookup(int tasks) +{ + int fd = map_fd[lru_hash_lookup_idx]; + uint32_t key; + long val = 1; + int ret; + + if (num_map_entries > lru_hash_lookup_test_entries) + lru_hash_lookup_test_entries = num_map_entries; + + /* Populate the lru_hash_map for LRU_HASH_LOOKUP perf test. + * + * It is fine that the user requests for a map with + * num_map_entries < 32 and some of the later lru hash lookup + * may return not found. For LRU map, we are not interested + * in such small map performance. + */ + for (key = 0; key < lru_hash_lookup_test_entries; key++) { + ret = bpf_map_update_elem(fd, &key, &val, BPF_NOEXIST); + if (ret) + return ret; + } + + return 0; +} + static void do_test_lru(enum test_type test, int cpu) { static int inner_lru_map_fds[MAX_NR_CPUS]; @@ -135,13 +165,17 @@ static void do_test_lru(enum test_type test, int cpu) if (test == LRU_HASH_PREALLOC) { test_name = "lru_hash_map_perf"; - in6.sin6_addr.s6_addr16[7] = 0; + in6.sin6_addr.s6_addr16[2] = 0; } else if (test == NOCOMMON_LRU_HASH_PREALLOC) { test_name = "nocommon_lru_hash_map_perf"; - in6.sin6_addr.s6_addr16[7] = 1; + in6.sin6_addr.s6_addr16[2] = 1; } else if (test == INNER_LRU_HASH_PREALLOC) { test_name = "inner_lru_hash_map_perf"; - in6.sin6_addr.s6_addr16[7] = 2; + in6.sin6_addr.s6_addr16[2] = 2; + } else if (test == LRU_HASH_LOOKUP) { + test_name = "lru_hash_lookup_perf"; + in6.sin6_addr.s6_addr16[2] = 3; + in6.sin6_addr.s6_addr32[3] = 0; } else { assert(0); } @@ -150,6 +184,11 @@ static void do_test_lru(enum test_type test, int cpu) for (i = 0; i < max_cnt; i++) { ret = connect(-1, (const struct sockaddr *)&in6, sizeof(in6)); assert(ret == -1 && errno == EBADF); + if (in6.sin6_addr.s6_addr32[3] < + lru_hash_lookup_test_entries - 32) + in6.sin6_addr.s6_addr32[3] += 32; + else + in6.sin6_addr.s6_addr32[3] = 0; } printf("%d:%s pre-alloc %lld events per sec\n", cpu, test_name, @@ -171,6 +210,11 @@ static void test_inner_lru_hash_prealloc(int cpu) do_test_lru(INNER_LRU_HASH_PREALLOC, cpu); } +static void test_lru_hash_lookup(int cpu) +{ + do_test_lru(LRU_HASH_LOOKUP, cpu); +} + static void test_percpu_hash_prealloc(int cpu) { __u64 start_time; @@ -243,6 +287,11 @@ static void test_array_lookup(int cpu) cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time)); } +typedef int (*pre_test_func)(int tasks); +const pre_test_func pre_test_funcs[] = { + [LRU_HASH_LOOKUP] = pre_test_lru_hash_lookup, +}; + typedef void (*test_func)(int cpu); const test_func test_funcs[] = { [HASH_PREALLOC] = test_hash_prealloc, @@ -255,8 +304,25 @@ const test_func test_funcs[] = { [HASH_LOOKUP] = test_hash_lookup, [ARRAY_LOOKUP] = test_array_lookup, [INNER_LRU_HASH_PREALLOC] = test_inner_lru_hash_prealloc, + [LRU_HASH_LOOKUP] = test_lru_hash_lookup, }; +static int pre_test(int tasks) +{ + int i; + + for (i = 0; i < NR_TESTS; i++) { + if (pre_test_funcs[i] && check_test_flags(i)) { + int ret = pre_test_funcs[i](tasks); + + if (ret) + return ret; + } + } + + return 0; +} + static void loop(int cpu) { cpu_set_t cpuset; @@ -277,6 +343,8 @@ static void run_perf_test(int tasks) pid_t pid[tasks]; int i; + assert(!pre_test(tasks)); + for (i = 0; i < tasks; i++) { pid[i] = fork(); if (pid[i] == 0) { @@ -344,6 +412,9 @@ static void fixup_map(struct bpf_map_data *map, int idx) array_of_lru_hashs_idx = idx; } + if (!strcmp("lru_hash_lookup_map", map->name)) + lru_hash_lookup_idx = idx; + if (num_map_entries <= 0) return; -- cgit v1.2.3-73-gaa49b From cc555421bc118edd070f41258d6f55f1ccfc2558 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 31 Aug 2017 23:27:12 -0700 Subject: bpf: Inline LRU map lookup Inline the lru map lookup to save the cost in making calls to bpf_map_lookup_elem() and htab_lru_map_lookup_elem(). Different LRU hash size is tested. The benefit diminishes when the cache miss starts to dominate in the bigger LRU hash. Considering the change is simple, it is still worth to optimize. First column: Size of the LRU hash Second column: Number of lookups/s Before: > for i in $(seq 9 20); do echo "$((2**i+1)): $(./map_perf_test 1024 1 $((2**i+1)) 10000000 | awk '{print $3}')"; done 513: 1132020 1025: 1056826 2049: 1007024 4097: 853298 8193: 742723 16385: 712600 32769: 688142 65537: 677028 131073: 619437 262145: 498770 524289: 316695 1048577: 260038 After: > for i in $(seq 9 20); do echo "$((2**i+1)): $(./map_perf_test 1024 1 $((2**i+1)) 10000000 | awk '{print $3}')"; done 513: 1221851 1025: 1144695 2049: 1049902 4097: 884460 8193: 773731 16385: 729673 32769: 721989 65537: 715530 131073: 671665 262145: 516987 524289: 321125 1048577: 260048 Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d246905f2bb1..682f4543fefa 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -514,6 +514,24 @@ static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static u32 htab_lru_map_gen_lookup(struct bpf_map *map, + struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + const int ret = BPF_REG_0; + + *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); + *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); + *insn++ = BPF_ST_MEM(BPF_B, ret, + offsetof(struct htab_elem, lru_node) + + offsetof(struct bpf_lru_node, ref), + 1); + *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, + offsetof(struct htab_elem, key) + + round_up(map->key_size, 8)); + return insn - insn_buf; +} + /* It is called from the bpf_lru_list when the LRU needs to delete * older elements from the htab. */ @@ -1137,6 +1155,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_lookup_elem = htab_lru_map_lookup_elem, .map_update_elem = htab_lru_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, + .map_gen_lookup = htab_lru_map_gen_lookup, }; /* Called from eBPF program */ -- cgit v1.2.3-73-gaa49b From bb9b9f8802212d98e70c63045b1734162945eaa5 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 31 Aug 2017 23:27:13 -0700 Subject: bpf: Only set node->ref = 1 if it has not been set This patch writes 'node->ref = 1' only if node->ref is 0. The number of lookups/s for a ~1M entries LRU map increased by ~30% (260097 to 343313). Other writes on 'node->ref = 0' is not changed. In those cases, the same cache line has to be changed anyway. First column: Size of the LRU hash Second column: Number of lookups/s Before: > echo "$((2**20+1)): $(./map_perf_test 1024 1 $((2**20+1)) 10000000 | awk '{print $3}')" 1048577: 260097 After: > echo "$((2**20+1)): $(./map_perf_test 1024 1 $((2**20+1)) 10000000 | awk '{print $3}')" 1048577: 343313 Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/bpf_lru_list.h | 3 ++- kernel/bpf/hashtab.c | 7 ++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index 5c35a98d02bf..7d4f89b7cb84 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -69,7 +69,8 @@ static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) /* ref is an approximation on access frequency. It does not * have to be very accurate. Hence, no protection is used. */ - node->ref = 1; + if (!node->ref) + node->ref = 1; } int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 682f4543fefa..431126f31ea3 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -519,9 +519,14 @@ static u32 htab_lru_map_gen_lookup(struct bpf_map *map, { struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; + const int ref_reg = BPF_REG_1; *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); - *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); + *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4); + *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret, + offsetof(struct htab_elem, lru_node) + + offsetof(struct bpf_lru_node, ref)); + *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1); *insn++ = BPF_ST_MEM(BPF_B, ret, offsetof(struct htab_elem, lru_node) + offsetof(struct bpf_lru_node, ref), -- cgit v1.2.3-73-gaa49b