From 6bf625a4140f24b490766043b307f8252519578b Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sun, 6 Feb 2022 11:36:56 -0800 Subject: Drivers: hv: vmbus: Rework use of DMA_BIT_MASK(64) Using DMA_BIT_MASK(64) as an initializer for a global variable causes problems with Clang 12.0.1. The compiler doesn't understand that value 64 is excluded from the shift at compile time, resulting in a build error. While this is a compiler problem, avoid the issue by setting up the dma_mask memory as part of struct hv_device, and initialize it using dma_set_mask(). Reported-by: Nathan Chancellor Reported-by: Vitaly Chikunov Reported-by: Jakub Kicinski Fixes: 743b237c3a7b ("scsi: storvsc: Add Isolation VM support for storvsc driver") Signed-off-by: Michael Kelley Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Link: https://lore.kernel.org/r/1644176216-12531-1-git-send-email-mikelley@microsoft.com Signed-off-by: Wei Liu --- include/linux/hyperv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index f565a8938836..fe2e0179ed51 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1262,6 +1262,7 @@ struct hv_device { struct vmbus_channel *channel; struct kset *channels_kset; struct device_dma_parameters dma_parms; + u64 dma_mask; /* place holder to keep track of the dir for hv device in debugfs */ struct dentry *debug_dir; -- cgit From 4f774c4a65bf3987d1a95c966e884f38c8a942af Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Thu, 27 Jan 2022 19:25:53 -0800 Subject: cpufreq: Reintroduce ready() callback This effectively revert '4bf8e582119e ("cpufreq: Remove ready() callback")', in order to reintroduce the ready callback. This is needed in order to be able to leave the thermal pressure interrupts in the Qualcomm CPUfreq driver disabled during initialization, so that it doesn't fire while related_cpus are still 0. Signed-off-by: Bjorn Andersson [ Viresh: Added the Chinese translation as well and updated commit msg ] Signed-off-by: Viresh Kumar --- Documentation/cpu-freq/cpu-drivers.rst | 3 +++ Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst | 2 ++ drivers/cpufreq/cpufreq.c | 4 ++++ include/linux/cpufreq.h | 3 +++ 4 files changed, 12 insertions(+) (limited to 'include') diff --git a/Documentation/cpu-freq/cpu-drivers.rst b/Documentation/cpu-freq/cpu-drivers.rst index 3b32336a7803..d84ededb66f9 100644 --- a/Documentation/cpu-freq/cpu-drivers.rst +++ b/Documentation/cpu-freq/cpu-drivers.rst @@ -75,6 +75,9 @@ And optionally .resume - A pointer to a per-policy resume function which is called with interrupts disabled and _before_ the governor is started again. + .ready - A pointer to a per-policy ready function which is called after + the policy is fully initialized. + .attr - A pointer to a NULL-terminated list of "struct freq_attr" which allow to export values to sysfs. diff --git a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst index 87a36044f828..2ca92042767b 100644 --- a/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst +++ b/Documentation/translations/zh_CN/cpu-freq/cpu-drivers.rst @@ -84,6 +84,8 @@ CPUfreq核心层注册一个cpufreq_driver结构体。 .resume - 一个指向per-policy恢复函数的指针,该函数在关中断且在调节器再一次启动前被 调用。 + .ready - 一个指向per-policy准备函数的指针,该函数在策略完全初始化之后被调用。 + .attr - 一个指向NULL结尾的"struct freq_attr"列表的指针,该列表允许导出值到 sysfs。 diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index b8d95536ee22..80f535cc8a75 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1518,6 +1518,10 @@ static int cpufreq_online(unsigned int cpu) kobject_uevent(&policy->kobj, KOBJ_ADD); + /* Callback for handling stuff after policy is ready */ + if (cpufreq_driver->ready) + cpufreq_driver->ready(policy); + if (cpufreq_thermal_control_enabled(cpufreq_driver)) policy->cdev = of_cpufreq_cooling_register(policy); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 1ab29e61b078..3522a272b74d 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -382,6 +382,9 @@ struct cpufreq_driver { int (*suspend)(struct cpufreq_policy *policy); int (*resume)(struct cpufreq_policy *policy); + /* Will be called after the driver is fully initialized */ + void (*ready)(struct cpufreq_policy *policy); + struct freq_attr **attr; /* platform specific boost support code */ -- cgit From a8abb0c3dc1e28454851a00f8b7333d9695d566c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 9 Feb 2022 12:33:23 +0530 Subject: bpf: Fix crash due to incorrect copy_map_value When both bpf_spin_lock and bpf_timer are present in a BPF map value, copy_map_value needs to skirt both objects when copying a value into and out of the map. However, the current code does not set both s_off and t_off in copy_map_value, which leads to a crash when e.g. bpf_spin_lock is placed in map value with bpf_timer, as bpf_map_update_elem call will be able to overwrite the other timer object. When the issue is not fixed, an overwriting can produce the following splat: [root@(none) bpf]# ./test_progs -t timer_crash [ 15.930339] bpf_testmod: loading out-of-tree module taints kernel. [ 16.037849] ================================================================== [ 16.038458] BUG: KASAN: user-memory-access in __pv_queued_spin_lock_slowpath+0x32b/0x520 [ 16.038944] Write of size 8 at addr 0000000000043ec0 by task test_progs/325 [ 16.039399] [ 16.039514] CPU: 0 PID: 325 Comm: test_progs Tainted: G OE 5.16.0+ #278 [ 16.039983] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ArchLinux 1.15.0-1 04/01/2014 [ 16.040485] Call Trace: [ 16.040645] [ 16.040805] dump_stack_lvl+0x59/0x73 [ 16.041069] ? __pv_queued_spin_lock_slowpath+0x32b/0x520 [ 16.041427] kasan_report.cold+0x116/0x11b [ 16.041673] ? __pv_queued_spin_lock_slowpath+0x32b/0x520 [ 16.042040] __pv_queued_spin_lock_slowpath+0x32b/0x520 [ 16.042328] ? memcpy+0x39/0x60 [ 16.042552] ? pv_hash+0xd0/0xd0 [ 16.042785] ? lockdep_hardirqs_off+0x95/0xd0 [ 16.043079] __bpf_spin_lock_irqsave+0xdf/0xf0 [ 16.043366] ? bpf_get_current_comm+0x50/0x50 [ 16.043608] ? jhash+0x11a/0x270 [ 16.043848] bpf_timer_cancel+0x34/0xe0 [ 16.044119] bpf_prog_c4ea1c0f7449940d_sys_enter+0x7c/0x81 [ 16.044500] bpf_trampoline_6442477838_0+0x36/0x1000 [ 16.044836] __x64_sys_nanosleep+0x5/0x140 [ 16.045119] do_syscall_64+0x59/0x80 [ 16.045377] ? lock_is_held_type+0xe4/0x140 [ 16.045670] ? irqentry_exit_to_user_mode+0xa/0x40 [ 16.046001] ? mark_held_locks+0x24/0x90 [ 16.046287] ? asm_exc_page_fault+0x1e/0x30 [ 16.046569] ? asm_exc_page_fault+0x8/0x30 [ 16.046851] ? lockdep_hardirqs_on+0x7e/0x100 [ 16.047137] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 16.047405] RIP: 0033:0x7f9e4831718d [ 16.047602] Code: b4 0c 00 0f 05 eb a9 66 0f 1f 44 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d b3 6c 0c 00 f7 d8 64 89 01 48 [ 16.048764] RSP: 002b:00007fff488086b8 EFLAGS: 00000206 ORIG_RAX: 0000000000000023 [ 16.049275] RAX: ffffffffffffffda RBX: 00007f9e48683740 RCX: 00007f9e4831718d [ 16.049747] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00007fff488086d0 [ 16.050225] RBP: 00007fff488086f0 R08: 00007fff488085d7 R09: 00007f9e4cb594a0 [ 16.050648] R10: 0000000000000000 R11: 0000000000000206 R12: 00007f9e484cde30 [ 16.051124] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 16.051608] [ 16.051762] ================================================================== Fixes: 68134668c17f ("bpf: Add map side support for bpf timers.") Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220209070324.1093182-2-memxor@gmail.com --- include/linux/bpf.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fa517ae604ad..31a83449808b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -224,7 +224,8 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) if (unlikely(map_value_has_spin_lock(map))) { s_off = map->spin_lock_off; s_sz = sizeof(struct bpf_spin_lock); - } else if (unlikely(map_value_has_timer(map))) { + } + if (unlikely(map_value_has_timer(map))) { t_off = map->timer_off; t_sz = sizeof(struct bpf_timer); } -- cgit From 5eaed6eedbe9612f642ad2b880f961d1c6c8ec2b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 11 Feb 2022 11:49:53 -0800 Subject: bpf: Fix a bpf_timer initialization issue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The patch in [1] intends to fix a bpf_timer related issue, but the fix caused existing 'timer' selftest to fail with hang or some random errors. After some debug, I found an issue with check_and_init_map_value() in the hashtab.c. More specifically, in hashtab.c, we have code l_new = bpf_map_kmalloc_node(&htab->map, ...) check_and_init_map_value(&htab->map, l_new...) Note that bpf_map_kmalloc_node() does not do initialization so l_new contains random value. The function check_and_init_map_value() intends to zero the bpf_spin_lock and bpf_timer if they exist in the map. But I found bpf_spin_lock is zero'ed but bpf_timer is not zero'ed. With [1], later copy_map_value() skips copying of bpf_spin_lock and bpf_timer. The non-zero bpf_timer caused random failures for 'timer' selftest. Without [1], for both bpf_spin_lock and bpf_timer case, bpf_timer will be zero'ed, so 'timer' self test is okay. For check_and_init_map_value(), why bpf_spin_lock is zero'ed properly while bpf_timer not. In bpf uapi header, we have struct bpf_spin_lock { __u32 val; }; struct bpf_timer { __u64 :64; __u64 :64; } __attribute__((aligned(8))); The initialization code: *(struct bpf_spin_lock *)(dst + map->spin_lock_off) = (struct bpf_spin_lock){}; *(struct bpf_timer *)(dst + map->timer_off) = (struct bpf_timer){}; It appears the compiler has no obligation to initialize anonymous fields. For example, let us use clang with bpf target as below: $ cat t.c struct bpf_timer { unsigned long long :64; }; struct bpf_timer2 { unsigned long long a; }; void test(struct bpf_timer *t) { *t = (struct bpf_timer){}; } void test2(struct bpf_timer2 *t) { *t = (struct bpf_timer2){}; } $ clang -target bpf -O2 -c -g t.c $ llvm-objdump -d t.o ... 0000000000000000 : 0: 95 00 00 00 00 00 00 00 exit 0000000000000008 : 1: b7 02 00 00 00 00 00 00 r2 = 0 2: 7b 21 00 00 00 00 00 00 *(u64 *)(r1 + 0) = r2 3: 95 00 00 00 00 00 00 00 exit gcc11.2 does not have the above issue. But from INTERNATIONAL STANDARD ©ISO/IEC ISO/IEC 9899:201x Programming languages — C http://www.open-std.org/Jtc1/sc22/wg14/www/docs/n1547.pdf page 157: Except where explicitly stated otherwise, for the purposes of this subclause unnamed members of objects of structure and union type do not participate in initialization. Unnamed members of structure objects have indeterminate value even after initialization. To fix the problem, let use memset for bpf_timer case in check_and_init_map_value(). For consistency, memset is also used for bpf_spin_lock case. [1] https://lore.kernel.org/bpf/20220209070324.1093182-2-memxor@gmail.com/ Fixes: 68134668c17f3 ("bpf: Add map side support for bpf timers.") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20220211194953.3142152-1-yhs@fb.com --- include/linux/bpf.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 31a83449808b..d0ad379d1e62 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -209,11 +209,9 @@ static inline bool map_value_has_timer(const struct bpf_map *map) static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { if (unlikely(map_value_has_spin_lock(map))) - *(struct bpf_spin_lock *)(dst + map->spin_lock_off) = - (struct bpf_spin_lock){}; + memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock)); if (unlikely(map_value_has_timer(map))) - *(struct bpf_timer *)(dst + map->timer_off) = - (struct bpf_timer){}; + memset(dst + map->timer_off, 0, sizeof(struct bpf_timer)); } /* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */ -- cgit From ddbd89deb7d32b1fbb879f48d68fda1a8ac58e8e Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Fri, 11 Feb 2022 02:12:52 +0100 Subject: swiotlb: fix info leak with DMA_FROM_DEVICE The problem I'm addressing was discovered by the LTP test covering cve-2018-1000204. A short description of what happens follows: 1) The test case issues a command code 00 (TEST UNIT READY) via the SG_IO interface with: dxfer_len == 524288, dxdfer_dir == SG_DXFER_FROM_DEV and a corresponding dxferp. The peculiar thing about this is that TUR is not reading from the device. 2) In sg_start_req() the invocation of blk_rq_map_user() effectively bounces the user-space buffer. As if the device was to transfer into it. Since commit a45b599ad808 ("scsi: sg: allocate with __GFP_ZERO in sg_build_indirect()") we make sure this first bounce buffer is allocated with GFP_ZERO. 3) For the rest of the story we keep ignoring that we have a TUR, so the device won't touch the buffer we prepare as if the we had a DMA_FROM_DEVICE type of situation. My setup uses a virtio-scsi device and the buffer allocated by SG is mapped by the function virtqueue_add_split() which uses DMA_FROM_DEVICE for the "in" sgs (here scatter-gather and not scsi generics). This mapping involves bouncing via the swiotlb (we need swiotlb to do virtio in protected guest like s390 Secure Execution, or AMD SEV). 4) When the SCSI TUR is done, we first copy back the content of the second (that is swiotlb) bounce buffer (which most likely contains some previous IO data), to the first bounce buffer, which contains all zeros. Then we copy back the content of the first bounce buffer to the user-space buffer. 5) The test case detects that the buffer, which it zero-initialized, ain't all zeros and fails. One can argue that this is an swiotlb problem, because without swiotlb we leak all zeros, and the swiotlb should be transparent in a sense that it does not affect the outcome (if all other participants are well behaved). Copying the content of the original buffer into the swiotlb buffer is the only way I can think of to make swiotlb transparent in such scenarios. So let's do just that if in doubt, but allow the driver to tell us that the whole mapped buffer is going to be overwritten, in which case we can preserve the old behavior and avoid the performance impact of the extra bounce. Signed-off-by: Halil Pasic Signed-off-by: Christoph Hellwig --- Documentation/core-api/dma-attributes.rst | 8 ++++++++ include/linux/dma-mapping.h | 8 ++++++++ kernel/dma/swiotlb.c | 3 ++- 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 1887d92e8e92..17706dc91ec9 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -130,3 +130,11 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). + +DMA_ATTR_OVERWRITE +------------------ + +This is a hint to the DMA-mapping subsystem that the device is expected to +overwrite the entire mapped size, thus the caller does not require any of the +previous buffer contents to be preserved. This allows bounce-buffering +implementations to optimise DMA_FROM_DEVICE transfers. diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index dca2b1355bb1..6150d11a607e 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -61,6 +61,14 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) +/* + * This is a hint to the DMA-mapping subsystem that the device is expected + * to overwrite the entire mapped size, thus the caller does not require any + * of the previous buffer contents to be preserved. This allows + * bounce-buffering implementations to optimise DMA_FROM_DEVICE transfers. + */ +#define DMA_ATTR_OVERWRITE (1UL << 10) + /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. It is specific to a diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index f1e7ea160b43..bfc56cb21705 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -628,7 +628,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(mem->start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) + (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE || + dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } -- cgit From 26394fc118d6115390bd5b3a0fb17096271da227 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Fri, 11 Feb 2022 17:30:42 +0000 Subject: ipv6: mcast: use rcu-safe version of ipv6_get_lladdr() Some time ago 8965779d2c0e ("ipv6,mcast: always hold idev->lock before mca_lock") switched ipv6_get_lladdr() to __ipv6_get_lladdr(), which is rcu-unsafe version. That was OK, because idev->lock was held for these codepaths. In 88e2ca308094 ("mld: convert ifmcaddr6 to RCU") these external locks were removed, so we probably need to restore the original rcu-safe call. Otherwise, we occasionally get a machine crashed/stalled with the following in dmesg: [ 3405.966610][T230589] general protection fault, probably for non-canonical address 0xdead00000000008c: 0000 [#1] SMP NOPTI [ 3405.982083][T230589] CPU: 44 PID: 230589 Comm: kworker/44:3 Tainted: G O 5.15.19-cloudflare-2022.2.1 #1 [ 3405.998061][T230589] Hardware name: SUPA-COOL-SERV [ 3406.009552][T230589] Workqueue: mld mld_ifc_work [ 3406.017224][T230589] RIP: 0010:__ipv6_get_lladdr+0x34/0x60 [ 3406.025780][T230589] Code: 57 10 48 83 c7 08 48 89 e5 48 39 d7 74 3e 48 8d 82 38 ff ff ff eb 13 48 8b 90 d0 00 00 00 48 8d 82 38 ff ff ff 48 39 d7 74 22 <66> 83 78 32 20 77 1b 75 e4 89 ca 23 50 2c 75 dd 48 8b 50 08 48 8b [ 3406.055748][T230589] RSP: 0018:ffff94e4b3fc3d10 EFLAGS: 00010202 [ 3406.065617][T230589] RAX: dead00000000005a RBX: ffff94e4b3fc3d30 RCX: 0000000000000040 [ 3406.077477][T230589] RDX: dead000000000122 RSI: ffff94e4b3fc3d30 RDI: ffff8c3a31431008 [ 3406.089389][T230589] RBP: ffff94e4b3fc3d10 R08: 0000000000000000 R09: 0000000000000000 [ 3406.101445][T230589] R10: ffff8c3a31430000 R11: 000000000000000b R12: ffff8c2c37887100 [ 3406.113553][T230589] R13: ffff8c3a39537000 R14: 00000000000005dc R15: ffff8c3a31431000 [ 3406.125730][T230589] FS: 0000000000000000(0000) GS:ffff8c3b9fc80000(0000) knlGS:0000000000000000 [ 3406.138992][T230589] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3406.149895][T230589] CR2: 00007f0dfea1db60 CR3: 000000387b5f2000 CR4: 0000000000350ee0 [ 3406.162421][T230589] Call Trace: [ 3406.170235][T230589] [ 3406.177736][T230589] mld_newpack+0xfe/0x1a0 [ 3406.186686][T230589] add_grhead+0x87/0xa0 [ 3406.195498][T230589] add_grec+0x485/0x4e0 [ 3406.204310][T230589] ? newidle_balance+0x126/0x3f0 [ 3406.214024][T230589] mld_ifc_work+0x15d/0x450 [ 3406.223279][T230589] process_one_work+0x1e6/0x380 [ 3406.232982][T230589] worker_thread+0x50/0x3a0 [ 3406.242371][T230589] ? rescuer_thread+0x360/0x360 [ 3406.252175][T230589] kthread+0x127/0x150 [ 3406.261197][T230589] ? set_kthread_struct+0x40/0x40 [ 3406.271287][T230589] ret_from_fork+0x22/0x30 [ 3406.280812][T230589] [ 3406.288937][T230589] Modules linked in: ... [last unloaded: kheaders] [ 3406.476714][T230589] ---[ end trace 3525a7655f2f3b9e ]--- Fixes: 88e2ca308094 ("mld: convert ifmcaddr6 to RCU") Reported-by: David Pinilla Caparros Signed-off-by: Ignat Korchagin Signed-off-by: David S. Miller --- include/net/addrconf.h | 2 -- net/ipv6/addrconf.c | 4 ++-- net/ipv6/mcast.c | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index e7ce719838b5..59940e230b78 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -109,8 +109,6 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, int ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr); -int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, - u32 banned_flags); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f927c199a93c..3f23da8c0b10 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1839,8 +1839,8 @@ out: } EXPORT_SYMBOL(ipv6_dev_get_saddr); -int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, - u32 banned_flags) +static int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, + u32 banned_flags) { struct inet6_ifaddr *ifp; int err = -EADDRNOTAVAIL; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index bed8155508c8..a8861db52c18 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1759,7 +1759,7 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) skb_reserve(skb, hlen); skb_tailroom_reserve(skb, mtu, tlen); - if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) { + if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* : * use unspecified address as the source address * when a valid link-local address is not available. -- cgit From a2614140dc0f467a83aa3bb4b6ee2d6480a76202 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 11 Feb 2022 19:45:06 +0200 Subject: net: dsa: mv88e6xxx: flush switchdev FDB workqueue before removing VLAN mv88e6xxx is special among DSA drivers in that it requires the VTU to contain the VID of the FDB entry it modifies in mv88e6xxx_port_db_load_purge(), otherwise it will return -EOPNOTSUPP. Sometimes due to races this is not always satisfied even if external code does everything right (first deletes the FDB entries, then the VLAN), because DSA commits to hardware FDB entries asynchronously since commit c9eb3e0f8701 ("net: dsa: Add support for learning FDB through notification"). Therefore, the mv88e6xxx driver must close this race condition by itself, by asking DSA to flush the switchdev workqueue of any FDB deletions in progress, prior to exiting a VLAN. Fixes: c9eb3e0f8701 ("net: dsa: Add support for learning FDB through notification") Reported-by: Rafael Richter Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 7 +++++++ include/net/dsa.h | 1 + net/dsa/dsa.c | 1 + net/dsa/dsa_priv.h | 1 - 4 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 8530dbe403f4..ab1676553714 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -2284,6 +2284,13 @@ static int mv88e6xxx_port_vlan_del(struct dsa_switch *ds, int port, if (!mv88e6xxx_max_vid(chip)) return -EOPNOTSUPP; + /* The ATU removal procedure needs the FID to be mapped in the VTU, + * but FDB deletion runs concurrently with VLAN deletion. Flush the DSA + * switchdev workqueue to ensure that all FDB entries are deleted + * before we remove the VLAN. + */ + dsa_flush_workqueue(); + mv88e6xxx_reg_lock(chip); err = mv88e6xxx_port_get_pvid(chip, port, &pvid); diff --git a/include/net/dsa.h b/include/net/dsa.h index 57b3e4e7413b..85a5ba3772f5 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1187,6 +1187,7 @@ void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); void dsa_switch_shutdown(struct dsa_switch *ds); struct dsa_switch *dsa_switch_find(int tree_index, int sw_index); +void dsa_flush_workqueue(void); #ifdef CONFIG_PM_SLEEP int dsa_switch_suspend(struct dsa_switch *ds); int dsa_switch_resume(struct dsa_switch *ds); diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index d9d0d227092c..c43f7446a75d 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -349,6 +349,7 @@ void dsa_flush_workqueue(void) { flush_workqueue(dsa_owq); } +EXPORT_SYMBOL_GPL(dsa_flush_workqueue); int dsa_devlink_param_get(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 760306f0012f..23c79e91ac67 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -147,7 +147,6 @@ void dsa_tag_driver_put(const struct dsa_device_ops *ops); const struct dsa_device_ops *dsa_find_tagger_by_name(const char *buf); bool dsa_schedule_work(struct work_struct *work); -void dsa_flush_workqueue(void); const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops); static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops) -- cgit From 5891cd5ec46c2c2eb6427cb54d214b149635dd0e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 11 Feb 2022 12:06:23 -0800 Subject: net_sched: add __rcu annotation to netdev->qdisc syzbot found a data-race [1] which lead me to add __rcu annotations to netdev->qdisc, and proper accessors to get LOCKDEP support. [1] BUG: KCSAN: data-race in dev_activate / qdisc_lookup_rcu write to 0xffff888168ad6410 of 8 bytes by task 13559 on cpu 1: attach_default_qdiscs net/sched/sch_generic.c:1167 [inline] dev_activate+0x2ed/0x8f0 net/sched/sch_generic.c:1221 __dev_open+0x2e9/0x3a0 net/core/dev.c:1416 __dev_change_flags+0x167/0x3f0 net/core/dev.c:8139 rtnl_configure_link+0xc2/0x150 net/core/rtnetlink.c:3150 __rtnl_newlink net/core/rtnetlink.c:3489 [inline] rtnl_newlink+0xf4d/0x13e0 net/core/rtnetlink.c:3529 rtnetlink_rcv_msg+0x745/0x7e0 net/core/rtnetlink.c:5594 netlink_rcv_skb+0x14e/0x250 net/netlink/af_netlink.c:2494 rtnetlink_rcv+0x18/0x20 net/core/rtnetlink.c:5612 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x602/0x6d0 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x728/0x850 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmsg+0x195/0x230 net/socket.c:2496 __do_sys_sendmsg net/socket.c:2505 [inline] __se_sys_sendmsg net/socket.c:2503 [inline] __x64_sys_sendmsg+0x42/0x50 net/socket.c:2503 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff888168ad6410 of 8 bytes by task 13560 on cpu 0: qdisc_lookup_rcu+0x30/0x2e0 net/sched/sch_api.c:323 __tcf_qdisc_find+0x74/0x3a0 net/sched/cls_api.c:1050 tc_del_tfilter+0x1c7/0x1350 net/sched/cls_api.c:2211 rtnetlink_rcv_msg+0x5ba/0x7e0 net/core/rtnetlink.c:5585 netlink_rcv_skb+0x14e/0x250 net/netlink/af_netlink.c:2494 rtnetlink_rcv+0x18/0x20 net/core/rtnetlink.c:5612 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x602/0x6d0 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x728/0x850 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmsg+0x195/0x230 net/socket.c:2496 __do_sys_sendmsg net/socket.c:2505 [inline] __se_sys_sendmsg net/socket.c:2503 [inline] __x64_sys_sendmsg+0x42/0x50 net/socket.c:2503 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0xffffffff85dee080 -> 0xffff88815d96ec00 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 13560 Comm: syz-executor.2 Not tainted 5.17.0-rc3-syzkaller-00116-gf1baf68e1383-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 470502de5bdb ("net: sched: unlock rules update API") Signed-off-by: Eric Dumazet Cc: Vlad Buslov Reported-by: syzbot Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- net/core/rtnetlink.c | 6 ++++-- net/sched/cls_api.c | 6 +++--- net/sched/sch_api.c | 22 ++++++++++++---------- net/sched/sch_generic.c | 29 ++++++++++++++++------------- 5 files changed, 36 insertions(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e490b84732d1..8b5a314db167 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2158,7 +2158,7 @@ struct net_device { struct netdev_queue *_tx ____cacheline_aligned_in_smp; unsigned int num_tx_queues; unsigned int real_num_tx_queues; - struct Qdisc *qdisc; + struct Qdisc __rcu *qdisc; unsigned int tx_queue_len; spinlock_t tx_global_lock; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 710da8a36729..2fb8eb6791e8 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1699,6 +1699,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, { struct ifinfomsg *ifm; struct nlmsghdr *nlh; + struct Qdisc *qdisc; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); @@ -1716,6 +1717,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; + qdisc = rtnl_dereference(dev->qdisc); if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || nla_put_u8(skb, IFLA_OPERSTATE, @@ -1735,8 +1737,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, #endif put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || - (dev->qdisc && - nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || + (qdisc && + nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 5f0f346b576f..5ce1208a6ea3 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1044,7 +1044,7 @@ static int __tcf_qdisc_find(struct net *net, struct Qdisc **q, /* Find qdisc */ if (!*parent) { - *q = dev->qdisc; + *q = rcu_dereference(dev->qdisc); *parent = (*q)->handle; } else { *q = qdisc_lookup_rcu(dev, TC_H_MAJ(*parent)); @@ -2587,7 +2587,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) parent = tcm->tcm_parent; if (!parent) - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); else q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); if (!q) @@ -2962,7 +2962,7 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; if (!tcm->tcm_parent) - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); else q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 179825a3b2fd..e3c0e8ea2dbb 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -301,7 +301,7 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) if (!handle) return NULL; - q = qdisc_match_from_root(dev->qdisc, handle); + q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle); if (q) goto out; @@ -320,7 +320,7 @@ struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle) if (!handle) return NULL; - q = qdisc_match_from_root(dev->qdisc, handle); + q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle); if (q) goto out; @@ -1082,10 +1082,10 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, skip: if (!ingress) { notify_and_destroy(net, skb, n, classid, - dev->qdisc, new); + rtnl_dereference(dev->qdisc), new); if (new && !new->ops->attach) qdisc_refcount_inc(new); - dev->qdisc = new ? : &noop_qdisc; + rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc); if (new && new->ops->attach) new->ops->attach(new); @@ -1451,7 +1451,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, q = dev_ingress_queue(dev)->qdisc_sleeping; } } else { - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); } if (!q) { NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device"); @@ -1540,7 +1540,7 @@ replay: q = dev_ingress_queue(dev)->qdisc_sleeping; } } else { - q = dev->qdisc; + q = rtnl_dereference(dev->qdisc); } /* It may be default qdisc, ignore it */ @@ -1762,7 +1762,8 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) s_q_idx = 0; q_idx = 0; - if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx, + if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc), + skb, cb, &q_idx, s_q_idx, true, tca[TCA_DUMP_INVISIBLE]) < 0) goto done; @@ -2033,7 +2034,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, } else if (qid1) { qid = qid1; } else if (qid == 0) - qid = dev->qdisc->handle; + qid = rtnl_dereference(dev->qdisc)->handle; /* Now qid is genuine qdisc handle consistent * both with parent and child. @@ -2044,7 +2045,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, portid = TC_H_MAKE(qid, portid); } else { if (qid == 0) - qid = dev->qdisc->handle; + qid = rtnl_dereference(dev->qdisc)->handle; } /* OK. Locate qdisc */ @@ -2205,7 +2206,8 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; t = 0; - if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t, true) < 0) + if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc), + skb, tcm, cb, &t, s_t, true) < 0) goto done; dev_queue = dev_ingress_queue(dev); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index f893d9a81b01..5bab9f8b8f45 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -1164,30 +1164,33 @@ static void attach_default_qdiscs(struct net_device *dev) if (!netif_is_multiqueue(dev) || dev->priv_flags & IFF_NO_QUEUE) { netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); - dev->qdisc = txq->qdisc_sleeping; - qdisc_refcount_inc(dev->qdisc); + qdisc = txq->qdisc_sleeping; + rcu_assign_pointer(dev->qdisc, qdisc); + qdisc_refcount_inc(qdisc); } else { qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL); if (qdisc) { - dev->qdisc = qdisc; + rcu_assign_pointer(dev->qdisc, qdisc); qdisc->ops->attach(qdisc); } } + qdisc = rtnl_dereference(dev->qdisc); /* Detect default qdisc setup/init failed and fallback to "noqueue" */ - if (dev->qdisc == &noop_qdisc) { + if (qdisc == &noop_qdisc) { netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n", default_qdisc_ops->id, noqueue_qdisc_ops.id); dev->priv_flags |= IFF_NO_QUEUE; netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); - dev->qdisc = txq->qdisc_sleeping; - qdisc_refcount_inc(dev->qdisc); + qdisc = txq->qdisc_sleeping; + rcu_assign_pointer(dev->qdisc, qdisc); + qdisc_refcount_inc(qdisc); dev->priv_flags ^= IFF_NO_QUEUE; } #ifdef CONFIG_NET_SCHED - if (dev->qdisc != &noop_qdisc) - qdisc_hash_add(dev->qdisc, false); + if (qdisc != &noop_qdisc) + qdisc_hash_add(qdisc, false); #endif } @@ -1217,7 +1220,7 @@ void dev_activate(struct net_device *dev) * and noqueue_qdisc for virtual interfaces */ - if (dev->qdisc == &noop_qdisc) + if (rtnl_dereference(dev->qdisc) == &noop_qdisc) attach_default_qdiscs(dev); if (!netif_carrier_ok(dev)) @@ -1383,7 +1386,7 @@ static int qdisc_change_tx_queue_len(struct net_device *dev, void dev_qdisc_change_real_num_tx(struct net_device *dev, unsigned int new_real_tx) { - struct Qdisc *qdisc = dev->qdisc; + struct Qdisc *qdisc = rtnl_dereference(dev->qdisc); if (qdisc->ops->change_real_num_tx) qdisc->ops->change_real_num_tx(qdisc, new_real_tx); @@ -1447,7 +1450,7 @@ static void dev_init_scheduler_queue(struct net_device *dev, void dev_init_scheduler(struct net_device *dev) { - dev->qdisc = &noop_qdisc; + rcu_assign_pointer(dev->qdisc, &noop_qdisc); netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); @@ -1475,8 +1478,8 @@ void dev_shutdown(struct net_device *dev) netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); if (dev_ingress_queue(dev)) shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); - qdisc_put(dev->qdisc); - dev->qdisc = &noop_qdisc; + qdisc_put(rtnl_dereference(dev->qdisc)); + rcu_assign_pointer(dev->qdisc, &noop_qdisc); WARN_ON(timer_pending(&dev->watchdog_timer)); } -- cgit From 9ceaf6f76b203682bb6100e14b3d7da4c0bedde8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Feb 2022 11:15:53 -0800 Subject: bonding: fix data-races around agg_select_timer syzbot reported that two threads might write over agg_select_timer at the same time. Make agg_select_timer atomic to fix the races. BUG: KCSAN: data-race in bond_3ad_initiate_agg_selection / bond_3ad_state_machine_handler read to 0xffff8881242aea90 of 4 bytes by task 1846 on cpu 1: bond_3ad_state_machine_handler+0x99/0x2810 drivers/net/bonding/bond_3ad.c:2317 process_one_work+0x3f6/0x960 kernel/workqueue.c:2307 worker_thread+0x616/0xa70 kernel/workqueue.c:2454 kthread+0x1bf/0x1e0 kernel/kthread.c:377 ret_from_fork+0x1f/0x30 write to 0xffff8881242aea90 of 4 bytes by task 25910 on cpu 0: bond_3ad_initiate_agg_selection+0x18/0x30 drivers/net/bonding/bond_3ad.c:1998 bond_open+0x658/0x6f0 drivers/net/bonding/bond_main.c:3967 __dev_open+0x274/0x3a0 net/core/dev.c:1407 dev_open+0x54/0x190 net/core/dev.c:1443 bond_enslave+0xcef/0x3000 drivers/net/bonding/bond_main.c:1937 do_set_master net/core/rtnetlink.c:2532 [inline] do_setlink+0x94f/0x2500 net/core/rtnetlink.c:2736 __rtnl_newlink net/core/rtnetlink.c:3414 [inline] rtnl_newlink+0xfeb/0x13e0 net/core/rtnetlink.c:3529 rtnetlink_rcv_msg+0x745/0x7e0 net/core/rtnetlink.c:5594 netlink_rcv_skb+0x14e/0x250 net/netlink/af_netlink.c:2494 rtnetlink_rcv+0x18/0x20 net/core/rtnetlink.c:5612 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x602/0x6d0 net/netlink/af_netlink.c:1343 netlink_sendmsg+0x728/0x850 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmsg+0x195/0x230 net/socket.c:2496 __do_sys_sendmsg net/socket.c:2505 [inline] __se_sys_sendmsg net/socket.c:2503 [inline] __x64_sys_sendmsg+0x42/0x50 net/socket.c:2503 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x00000050 -> 0x0000004f Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 25910 Comm: syz-executor.1 Tainted: G W 5.17.0-rc4-syzkaller-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Jay Vosburgh Cc: Veaceslav Falico Signed-off-by: David S. Miller --- drivers/net/bonding/bond_3ad.c | 30 +++++++++++++++++++++++++----- include/net/bond_3ad.h | 2 +- 2 files changed, 26 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 9fd1d6cba3cd..a86b1f71762e 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -225,7 +225,7 @@ static inline int __check_agg_selection_timer(struct port *port) if (bond == NULL) return 0; - return BOND_AD_INFO(bond).agg_select_timer ? 1 : 0; + return atomic_read(&BOND_AD_INFO(bond).agg_select_timer) ? 1 : 0; } /** @@ -1995,7 +1995,7 @@ static void ad_marker_response_received(struct bond_marker *marker, */ void bond_3ad_initiate_agg_selection(struct bonding *bond, int timeout) { - BOND_AD_INFO(bond).agg_select_timer = timeout; + atomic_set(&BOND_AD_INFO(bond).agg_select_timer, timeout); } /** @@ -2278,6 +2278,28 @@ void bond_3ad_update_ad_actor_settings(struct bonding *bond) spin_unlock_bh(&bond->mode_lock); } +/** + * bond_agg_timer_advance - advance agg_select_timer + * @bond: bonding structure + * + * Return true when agg_select_timer reaches 0. + */ +static bool bond_agg_timer_advance(struct bonding *bond) +{ + int val, nval; + + while (1) { + val = atomic_read(&BOND_AD_INFO(bond).agg_select_timer); + if (!val) + return false; + nval = val - 1; + if (atomic_cmpxchg(&BOND_AD_INFO(bond).agg_select_timer, + val, nval) == val) + break; + } + return nval == 0; +} + /** * bond_3ad_state_machine_handler - handle state machines timeout * @work: work context to fetch bonding struct to work on from @@ -2313,9 +2335,7 @@ void bond_3ad_state_machine_handler(struct work_struct *work) if (!bond_has_slaves(bond)) goto re_arm; - /* check if agg_select_timer timer after initialize is timed out */ - if (BOND_AD_INFO(bond).agg_select_timer && - !(--BOND_AD_INFO(bond).agg_select_timer)) { + if (bond_agg_timer_advance(bond)) { slave = bond_first_slave_rcu(bond); port = slave ? &(SLAVE_AD_INFO(slave)->port) : NULL; diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index 38785d48baff..184105d68294 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -262,7 +262,7 @@ struct ad_system { struct ad_bond_info { struct ad_system system; /* 802.3ad system structure */ struct bond_3ad_stats stats; - u32 agg_select_timer; /* Timer to select aggregator after all adapter's hand shakes */ + atomic_t agg_select_timer; /* Timer to select aggregator after all adapter's hand shakes */ u16 aggregator_identifier; }; -- cgit From 0b0dff5b3b98c5c7ce848151df9da0b3cdf0cc8b Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 15 Feb 2022 11:00:37 -0500 Subject: ipv6: per-netns exclusive flowlabel checks Ipv6 flowlabels historically require a reservation before use. Optionally in exclusive mode (e.g., user-private). Commit 59c820b2317f ("ipv6: elide flowlabel check if no exclusive leases exist") introduced a fastpath that avoids this check when no exclusive leases exist in the system, and thus any flowlabel use will be granted. That allows skipping the control operation to reserve a flowlabel entirely. Though with a warning if the fast path fails: This is an optimization. Robust applications still have to revert to requesting leases if the fast path fails due to an exclusive lease. Still, this is subtle. Better isolate network namespaces from each other. Flowlabels are per-netns. Also record per-netns whether exclusive leases are in use. Then behavior does not change based on activity in other netns. Changes v2 - wrap in IS_ENABLED(CONFIG_IPV6) to avoid breakage if disabled Fixes: 59c820b2317f ("ipv6: elide flowlabel check if no exclusive leases exist") Link: https://lore.kernel.org/netdev/MWHPR2201MB1072BCCCFCE779E4094837ACD0329@MWHPR2201MB1072.namprd22.prod.outlook.com/ Reported-by: Congyu Liu Signed-off-by: Willem de Bruijn Tested-by: Congyu Liu Link: https://lore.kernel.org/r/20220215160037.1976072-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 5 ++++- include/net/netns/ipv6.h | 3 ++- net/ipv6/ip6_flowlabel.c | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 3afcb128e064..92eec13d1693 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -393,17 +393,20 @@ static inline void txopt_put(struct ipv6_txoptions *opt) kfree_rcu(opt, rcu); } +#if IS_ENABLED(CONFIG_IPV6) struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label); extern struct static_key_false_deferred ipv6_flowlabel_exclusive; static inline struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) { - if (static_branch_unlikely(&ipv6_flowlabel_exclusive.key)) + if (static_branch_unlikely(&ipv6_flowlabel_exclusive.key) && + READ_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl)) return __fl6_sock_lookup(sk, label) ? : ERR_PTR(-ENOENT); return NULL; } +#endif struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index a4b550380316..6bd7e5a85ce7 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -77,9 +77,10 @@ struct netns_ipv6 { spinlock_t fib6_gc_lock; unsigned int ip6_rt_gc_expire; unsigned long ip6_rt_last_gc; + unsigned char flowlabel_has_excl; #ifdef CONFIG_IPV6_MULTIPLE_TABLES - unsigned int fib6_rules_require_fldissect; bool fib6_has_custom_rules; + unsigned int fib6_rules_require_fldissect; #ifdef CONFIG_IPV6_SUBTREES unsigned int fib6_routes_require_src; #endif diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index aa673a6a7e43..ceb85c67ce39 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -450,8 +450,10 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, err = -EINVAL; goto done; } - if (fl_shared_exclusive(fl) || fl->opt) + if (fl_shared_exclusive(fl) || fl->opt) { + WRITE_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl, 1); static_branch_deferred_inc(&ipv6_flowlabel_exclusive); + } return fl; done: -- cgit From 7a5428dcb7902700b830e912feee4e845df7c019 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Feb 2022 08:52:31 +0100 Subject: block: fix surprise removal for drivers calling blk_set_queue_dying MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various block drivers call blk_set_queue_dying to mark a disk as dead due to surprise removal events, but since commit 8e141f9eb803 that doesn't work given that the GD_DEAD flag needs to be set to stop I/O. Replace the driver calls to blk_set_queue_dying with a new (and properly documented) blk_mark_disk_dead API, and fold blk_set_queue_dying into the only remaining caller. Fixes: 8e141f9eb803 ("block: drain file system I/O on del_gendisk") Reported-by: Markus Blöchl Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Link: https://lore.kernel.org/r/20220217075231.1140-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 10 ++-------- block/genhd.c | 14 ++++++++++++++ drivers/block/mtip32xx/mtip32xx.c | 2 +- drivers/block/rbd.c | 2 +- drivers/block/xen-blkfront.c | 2 +- drivers/md/dm.c | 2 +- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/multipath.c | 2 +- include/linux/blkdev.h | 3 ++- 9 files changed, 24 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/block/blk-core.c b/block/blk-core.c index d93e3bb9a769..1039515c99d6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -284,13 +284,6 @@ void blk_queue_start_drain(struct request_queue *q) wake_up_all(&q->mq_freeze_wq); } -void blk_set_queue_dying(struct request_queue *q) -{ - blk_queue_flag_set(QUEUE_FLAG_DYING, q); - blk_queue_start_drain(q); -} -EXPORT_SYMBOL_GPL(blk_set_queue_dying); - /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown @@ -308,7 +301,8 @@ void blk_cleanup_queue(struct request_queue *q) WARN_ON_ONCE(blk_queue_registered(q)); /* mark @q DYING, no new request or merges will be allowed afterwards */ - blk_set_queue_dying(q); + blk_queue_flag_set(QUEUE_FLAG_DYING, q); + blk_queue_start_drain(q); blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); diff --git a/block/genhd.c b/block/genhd.c index 626c8406f21a..9eca1f7d35c9 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -548,6 +548,20 @@ out_free_ext_minor: } EXPORT_SYMBOL(device_add_disk); +/** + * blk_mark_disk_dead - mark a disk as dead + * @disk: disk to mark as dead + * + * Mark as disk as dead (e.g. surprise removed) and don't accept any new I/O + * to this disk. + */ +void blk_mark_disk_dead(struct gendisk *disk) +{ + set_bit(GD_DEAD, &disk->state); + blk_queue_start_drain(disk->queue); +} +EXPORT_SYMBOL_GPL(blk_mark_disk_dead); + /** * del_gendisk - remove the gendisk * @disk: the struct gendisk to remove diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index e6005c232328..2b588b62cbbb 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -4112,7 +4112,7 @@ static void mtip_pci_remove(struct pci_dev *pdev) "Completion workers still active!\n"); } - blk_set_queue_dying(dd->queue); + blk_mark_disk_dead(dd->disk); set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag); /* Clean up the block layer. */ diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4203cdab8abf..b844432bad20 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -7185,7 +7185,7 @@ static ssize_t do_rbd_remove(struct bus_type *bus, * IO to complete/fail. */ blk_mq_freeze_queue(rbd_dev->disk->queue); - blk_set_queue_dying(rbd_dev->disk->queue); + blk_mark_disk_dead(rbd_dev->disk); } del_gendisk(rbd_dev->disk); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index ccd0dd0c6b83..ca71a0585333 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2126,7 +2126,7 @@ static void blkfront_closing(struct blkfront_info *info) /* No more blkif_request(). */ blk_mq_stop_hw_queues(info->rq); - blk_set_queue_dying(info->rq); + blk_mark_disk_dead(info->gd); set_capacity(info->gd, 0); for_each_rinfo(info, rinfo, i) { diff --git a/drivers/md/dm.c b/drivers/md/dm.c index dcbd6d201619..997ace47bbd5 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2077,7 +2077,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait) set_bit(DMF_FREEING, &md->flags); spin_unlock(&_minor_lock); - blk_set_queue_dying(md->queue); + blk_mark_disk_dead(md->disk); /* * Take suspend_lock so that presuspend and postsuspend methods diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 79005ea1a33e..469f23186159 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4574,7 +4574,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns) if (test_and_set_bit(NVME_NS_DEAD, &ns->flags)) return; - blk_set_queue_dying(ns->queue); + blk_mark_disk_dead(ns->disk); nvme_start_ns_queue(ns); set_capacity_and_notify(ns->disk, 0); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index f8bf6606eb2f..ff775235534c 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -848,7 +848,7 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) { if (!head->disk) return; - blk_set_queue_dying(head->disk->queue); + blk_mark_disk_dead(head->disk); /* make sure all pending bios are cleaned up */ kblockd_schedule_work(&head->requeue_work); flush_work(&head->requeue_work); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f35aea98bc35..16b47035e4b0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -748,7 +748,8 @@ extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q, bool __must_check blk_get_queue(struct request_queue *); extern void blk_put_queue(struct request_queue *); -extern void blk_set_queue_dying(struct request_queue *); + +void blk_mark_disk_dead(struct gendisk *disk); #ifdef CONFIG_BLOCK /* -- cgit From d95d6320ba7a51d61c097ffc3bcafcf70283414e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Feb 2022 09:32:17 -0800 Subject: ipv6: fix data-race in fib6_info_hw_flags_set / fib6_purge_rt Because fib6_info_hw_flags_set() is called without any synchronization, all accesses to gi6->offload, fi->trap and fi->offload_failed need some basic protection like READ_ONCE()/WRITE_ONCE(). BUG: KCSAN: data-race in fib6_info_hw_flags_set / fib6_purge_rt read to 0xffff8881087d5886 of 1 bytes by task 13953 on cpu 0: fib6_drop_pcpu_from net/ipv6/ip6_fib.c:1007 [inline] fib6_purge_rt+0x4f/0x580 net/ipv6/ip6_fib.c:1033 fib6_del_route net/ipv6/ip6_fib.c:1983 [inline] fib6_del+0x696/0x890 net/ipv6/ip6_fib.c:2028 __ip6_del_rt net/ipv6/route.c:3876 [inline] ip6_del_rt+0x83/0x140 net/ipv6/route.c:3891 __ipv6_dev_ac_dec+0x2b5/0x370 net/ipv6/anycast.c:374 ipv6_dev_ac_dec net/ipv6/anycast.c:387 [inline] __ipv6_sock_ac_close+0x141/0x200 net/ipv6/anycast.c:207 ipv6_sock_ac_close+0x79/0x90 net/ipv6/anycast.c:220 inet6_release+0x32/0x50 net/ipv6/af_inet6.c:476 __sock_release net/socket.c:650 [inline] sock_close+0x6c/0x150 net/socket.c:1318 __fput+0x295/0x520 fs/file_table.c:280 ____fput+0x11/0x20 fs/file_table.c:313 task_work_run+0x8e/0x110 kernel/task_work.c:164 tracehook_notify_resume include/linux/tracehook.h:189 [inline] exit_to_user_mode_loop kernel/entry/common.c:175 [inline] exit_to_user_mode_prepare+0x160/0x190 kernel/entry/common.c:207 __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline] syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:300 do_syscall_64+0x50/0xd0 arch/x86/entry/common.c:86 entry_SYSCALL_64_after_hwframe+0x44/0xae write to 0xffff8881087d5886 of 1 bytes by task 1912 on cpu 1: fib6_info_hw_flags_set+0x155/0x3b0 net/ipv6/route.c:6230 nsim_fib6_rt_hw_flags_set drivers/net/netdevsim/fib.c:668 [inline] nsim_fib6_rt_add drivers/net/netdevsim/fib.c:691 [inline] nsim_fib6_rt_insert drivers/net/netdevsim/fib.c:756 [inline] nsim_fib6_event drivers/net/netdevsim/fib.c:853 [inline] nsim_fib_event drivers/net/netdevsim/fib.c:886 [inline] nsim_fib_event_work+0x284f/0x2cf0 drivers/net/netdevsim/fib.c:1477 process_one_work+0x3f6/0x960 kernel/workqueue.c:2307 worker_thread+0x616/0xa70 kernel/workqueue.c:2454 kthread+0x2c7/0x2e0 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 value changed: 0x22 -> 0x2a Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 1912 Comm: kworker/1:3 Not tainted 5.16.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events nsim_fib_event_work Fixes: 0c5fcf9e249e ("IPv6: Add "offload failed" indication to routes") Fixes: bb3c4ab93e44 ("ipv6: Add "offload" and "trap" indications to routes") Signed-off-by: Eric Dumazet Cc: Amit Cohen Cc: Ido Schimmel Reported-by: syzbot Link: https://lore.kernel.org/r/20220216173217.3792411-2-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/netdevsim/fib.c | 4 ++-- include/net/ip6_fib.h | 10 ++++++---- net/ipv6/route.c | 19 ++++++++++--------- 3 files changed, 18 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/net/netdevsim/fib.c b/drivers/net/netdevsim/fib.c index 4300261e2f9e..378ee779061c 100644 --- a/drivers/net/netdevsim/fib.c +++ b/drivers/net/netdevsim/fib.c @@ -623,14 +623,14 @@ static int nsim_fib6_rt_append(struct nsim_fib_data *data, if (err) goto err_fib6_rt_nh_del; - fib6_event->rt_arr[i]->trap = true; + WRITE_ONCE(fib6_event->rt_arr[i]->trap, true); } return 0; err_fib6_rt_nh_del: for (i--; i >= 0; i--) { - fib6_event->rt_arr[i]->trap = false; + WRITE_ONCE(fib6_event->rt_arr[i]->trap, false); nsim_fib6_rt_nh_del(fib6_rt, fib6_event->rt_arr[i]); } return err; diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 40ae8f1b18e5..2048bc8748cb 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -190,14 +190,16 @@ struct fib6_info { u32 fib6_metric; u8 fib6_protocol; u8 fib6_type; + + u8 offload; + u8 trap; + u8 offload_failed; + u8 should_flush:1, dst_nocount:1, dst_nopolicy:1, fib6_destroying:1, - offload:1, - trap:1, - offload_failed:1, - unused:1; + unused:4; struct rcu_head rcu; struct nexthop *nh; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f4884cda13b9..ea1cf414a92e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5753,11 +5753,11 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, } if (!dst) { - if (rt->offload) + if (READ_ONCE(rt->offload)) rtm->rtm_flags |= RTM_F_OFFLOAD; - if (rt->trap) + if (READ_ONCE(rt->trap)) rtm->rtm_flags |= RTM_F_TRAP; - if (rt->offload_failed) + if (READ_ONCE(rt->offload_failed)) rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; } @@ -6215,19 +6215,20 @@ void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, struct sk_buff *skb; int err; - if (f6i->offload == offload && f6i->trap == trap && - f6i->offload_failed == offload_failed) + if (READ_ONCE(f6i->offload) == offload && + READ_ONCE(f6i->trap) == trap && + READ_ONCE(f6i->offload_failed) == offload_failed) return; - f6i->offload = offload; - f6i->trap = trap; + WRITE_ONCE(f6i->offload, offload); + WRITE_ONCE(f6i->trap, trap); /* 2 means send notifications only if offload_failed was changed. */ if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 && - f6i->offload_failed == offload_failed) + READ_ONCE(f6i->offload_failed) == offload_failed) return; - f6i->offload_failed = offload_failed; + WRITE_ONCE(f6i->offload_failed, offload_failed); if (!rcu_access_pointer(f6i->fib6_node)) /* The route was removed from the tree, do not send -- cgit From a1cdec57e03a1352e92fbbe7974039dda4efcec0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 Feb 2022 09:05:02 -0800 Subject: net-timestamp: convert sk->sk_tskey to atomic_t UDP sendmsg() can be lockless, this is causing all kinds of data races. This patch converts sk->sk_tskey to remove one of these races. BUG: KCSAN: data-race in __ip_append_data / __ip_append_data read to 0xffff8881035d4b6c of 4 bytes by task 8877 on cpu 1: __ip_append_data+0x1c1/0x1de0 net/ipv4/ip_output.c:994 ip_make_skb+0x13f/0x2d0 net/ipv4/ip_output.c:1636 udp_sendmsg+0x12bd/0x14c0 net/ipv4/udp.c:1249 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae write to 0xffff8881035d4b6c of 4 bytes by task 8880 on cpu 0: __ip_append_data+0x1d8/0x1de0 net/ipv4/ip_output.c:994 ip_make_skb+0x13f/0x2d0 net/ipv4/ip_output.c:1636 udp_sendmsg+0x12bd/0x14c0 net/ipv4/udp.c:1249 inet_sendmsg+0x5f/0x80 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:705 [inline] sock_sendmsg net/socket.c:725 [inline] ____sys_sendmsg+0x39a/0x510 net/socket.c:2413 ___sys_sendmsg net/socket.c:2467 [inline] __sys_sendmmsg+0x267/0x4c0 net/socket.c:2553 __do_sys_sendmmsg net/socket.c:2582 [inline] __se_sys_sendmmsg net/socket.c:2579 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2579 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000054d -> 0x0000054e Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 8880 Comm: syz-executor.5 Not tainted 5.17.0-rc2-syzkaller-00167-gdcb85f85fa6f-dirty #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 09c2d251b707 ("net-timestamp: add key to disambiguate concurrent datagrams") Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/sock.h | 4 ++-- net/can/j1939/transport.c | 2 +- net/core/skbuff.c | 2 +- net/core/sock.c | 4 ++-- net/ipv4/ip_output.c | 2 +- net/ipv6/ip6_output.c | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index ff9b508d9c5f..50aecd28b355 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -507,7 +507,7 @@ struct sock { #endif u16 sk_tsflags; u8 sk_shutdown; - u32 sk_tskey; + atomic_t sk_tskey; atomic_t sk_zckey; u8 sk_clockid; @@ -2667,7 +2667,7 @@ static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags, __sock_tx_timestamp(tsflags, tx_flags); if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey && tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) - *tskey = sk->sk_tskey++; + *tskey = atomic_inc_return(&sk->sk_tskey) - 1; } if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) *tx_flags |= SKBTX_WIFI_STATUS; diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index a271688780a2..307ee1174a6e 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -2006,7 +2006,7 @@ struct j1939_session *j1939_tp_send(struct j1939_priv *priv, /* set the end-packet for broadcast */ session->pkt.last = session->pkt.total; - skcb->tskey = session->sk->sk_tskey++; + skcb->tskey = atomic_inc_return(&session->sk->sk_tskey) - 1; session->tskey = skcb->tskey; return session; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9d0388bed0c1..6a15ce3eb1d3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4730,7 +4730,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk_is_tcp(sk)) - serr->ee.ee_data -= sk->sk_tskey; + serr->ee.ee_data -= atomic_read(&sk->sk_tskey); } err = sock_queue_err_skb(sk, skb); diff --git a/net/core/sock.c b/net/core/sock.c index 4ff806d71921..6eb174805bf0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -879,9 +879,9 @@ int sock_set_timestamping(struct sock *sk, int optname, if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return -EINVAL; - sk->sk_tskey = tcp_sk(sk)->snd_una; + atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); } else { - sk->sk_tskey = 0; + atomic_set(&sk->sk_tskey, 0); } } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 139cec29ed06..7911916a480b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -991,7 +991,7 @@ static int __ip_append_data(struct sock *sk, if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) - tskey = sk->sk_tskey++; + tskey = atomic_inc_return(&sk->sk_tskey) - 1; hh_len = LL_RESERVED_SPACE(rt->dst.dev); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 2995f8d89e7e..304a295de84f 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1465,7 +1465,7 @@ static int __ip6_append_data(struct sock *sk, if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) - tskey = sk->sk_tskey++; + tskey = atomic_inc_return(&sk->sk_tskey) - 1; hh_len = LL_RESERVED_SPACE(rt->dst.dev); -- cgit From b1e8206582f9d680cff7d04828708c8b6ab32957 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Feb 2022 10:16:57 +0100 Subject: sched: Fix yet more sched_fork() races Where commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") fixed a fork race vs cgroup, it opened up a race vs syscalls by not placing the task on the runqueue before it gets exposed through the pidhash. Commit 13765de8148f ("sched/fair: Fix fault in reweight_entity") is trying to fix a single instance of this, instead fix the whole class of issues, effectively reverting this commit. Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") Reported-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tadeusz Struk Tested-by: Zhang Qiao Tested-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/YgoeCbwj5mbCR0qA@hirez.programming.kicks-ass.net --- include/linux/sched/task.h | 4 ++-- kernel/fork.c | 13 ++++++++++++- kernel/sched/core.c | 34 +++++++++++++++++++++------------- 3 files changed, 35 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index b9198a1b3a84..e84e54d1b490 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -54,8 +54,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); -extern void sched_post_fork(struct task_struct *p, - struct kernel_clone_args *kargs); +extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); diff --git a/kernel/fork.c b/kernel/fork.c index d75a528f7b21..c607d238fc23 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2266,6 +2266,17 @@ static __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_put_pidfd; + /* + * Now that the cgroups are pinned, re-clone the parent cgroup and put + * the new task on the correct runqueue. All this *before* the task + * becomes visible. + * + * This isn't part of ->can_fork() because while the re-cloning is + * cgroup specific, it unconditionally needs to place the task on a + * runqueue. + */ + sched_cgroup_fork(p, args); + /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. In particular, we do @@ -2376,7 +2387,7 @@ static __latent_entropy struct task_struct *copy_process( write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - sched_post_fork(p, args); + sched_post_fork(p); cgroup_post_fork(p, args); perf_event_fork(p); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fcf0c180617c..9745613d531c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1214,9 +1214,8 @@ int tg_nop(struct task_group *tg, void *data) } #endif -static void set_load_weight(struct task_struct *p) +static void set_load_weight(struct task_struct *p, bool update_load) { - bool update_load = !(READ_ONCE(p->__state) & TASK_NEW); int prio = p->static_prio - MAX_RT_PRIO; struct load_weight *load = &p->se.load; @@ -4407,7 +4406,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->static_prio = NICE_TO_PRIO(0); p->prio = p->normal_prio = p->static_prio; - set_load_weight(p); + set_load_weight(p, false); /* * We don't need the reset flag anymore after the fork. It has @@ -4425,6 +4424,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) init_entity_runnable_average(&p->se); + #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -4440,18 +4440,23 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } -void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) +void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) { unsigned long flags; -#ifdef CONFIG_CGROUP_SCHED - struct task_group *tg; -#endif + /* + * Because we're not yet on the pid-hash, p->pi_lock isn't strictly + * required yet, but lockdep gets upset if rules are violated. + */ raw_spin_lock_irqsave(&p->pi_lock, flags); #ifdef CONFIG_CGROUP_SCHED - tg = container_of(kargs->cset->subsys[cpu_cgrp_id], - struct task_group, css); - p->sched_task_group = autogroup_task_group(p, tg); + if (1) { + struct task_group *tg; + tg = container_of(kargs->cset->subsys[cpu_cgrp_id], + struct task_group, css); + tg = autogroup_task_group(p, tg); + p->sched_task_group = tg; + } #endif rseq_migrate(p); /* @@ -4462,7 +4467,10 @@ void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} +void sched_post_fork(struct task_struct *p) +{ uclamp_post_fork(p); } @@ -6922,7 +6930,7 @@ void set_user_nice(struct task_struct *p, long nice) put_prev_task(rq, p); p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p); + set_load_weight(p, true); old_prio = p->prio; p->prio = effective_prio(p); @@ -7213,7 +7221,7 @@ static void __setscheduler_params(struct task_struct *p, */ p->rt_priority = attr->sched_priority; p->normal_prio = normal_prio(p); - set_load_weight(p); + set_load_weight(p, true); } /* @@ -9446,7 +9454,7 @@ void __init sched_init(void) #endif } - set_load_weight(&init_task); + set_load_weight(&init_task, false); /* * The boot idle thread does lazy MMU switching as well: -- cgit From 5486f5bf790b5c664913076c3194b8f916a5c7ad Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 17 Feb 2022 14:35:49 +0100 Subject: net: Force inlining of checksum functions in net/checksum.h All functions defined as static inline in net/checksum.h are meant to be inlined for performance reason. But since commit ac7c3e4ff401 ("compiler: enable CONFIG_OPTIMIZE_INLINING forcibly") the compiler is allowed to uninline functions when it wants. Fair enough in the general case, but for tiny performance critical checksum helpers that's counter-productive. The problem mainly arises when selecting CONFIG_CC_OPTIMISE_FOR_SIZE, Those helpers being 'static inline' in header files you suddenly find them duplicated many times in the resulting vmlinux. Here is a typical exemple when building powerpc pmac32_defconfig with CONFIG_CC_OPTIMISE_FOR_SIZE. csum_sub() appears 4 times: c04a23cc : c04a23cc: 7c 84 20 f8 not r4,r4 c04a23d0: 7c 63 20 14 addc r3,r3,r4 c04a23d4: 7c 63 01 94 addze r3,r3 c04a23d8: 4e 80 00 20 blr ... c04a2ce8: 4b ff f6 e5 bl c04a23cc ... c04a2d2c: 4b ff f6 a1 bl c04a23cc ... c04a2d54: 4b ff f6 79 bl c04a23cc ... c04a754c : c04a754c: 7c 84 20 f8 not r4,r4 c04a7550: 7c 63 20 14 addc r3,r3,r4 c04a7554: 7c 63 01 94 addze r3,r3 c04a7558: 4e 80 00 20 blr ... c04ac930: 4b ff ac 1d bl c04a754c ... c04ad264: 4b ff a2 e9 bl c04a754c ... c04e3b08 : c04e3b08: 7c 84 20 f8 not r4,r4 c04e3b0c: 7c 63 20 14 addc r3,r3,r4 c04e3b10: 7c 63 01 94 addze r3,r3 c04e3b14: 4e 80 00 20 blr ... c04e5788: 4b ff e3 81 bl c04e3b08 ... c04e65c8: 4b ff d5 41 bl c04e3b08 ... c0512d34 : c0512d34: 7c 84 20 f8 not r4,r4 c0512d38: 7c 63 20 14 addc r3,r3,r4 c0512d3c: 7c 63 01 94 addze r3,r3 c0512d40: 4e 80 00 20 blr ... c0512dfc: 4b ff ff 39 bl c0512d34 ... c05138bc: 4b ff f4 79 bl c0512d34 ... Restore the expected behaviour by using __always_inline for all functions defined in net/checksum.h vmlinux size is even reduced by 256 bytes with this patch: text data bss dec hex filename 6980022 2515362 194384 9689768 93daa8 vmlinux.before 6979862 2515266 194384 9689512 93d9a8 vmlinux.now Fixes: ac7c3e4ff401 ("compiler: enable CONFIG_OPTIMIZE_INLINING forcibly") Cc: Masahiro Yamada Cc: Nick Desaulniers Cc: Andrew Morton Signed-off-by: Christophe Leroy Signed-off-by: David S. Miller --- include/net/checksum.h | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/net/checksum.h b/include/net/checksum.h index 5218041e5c8f..02d0c2d01014 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -22,7 +22,7 @@ #include #ifndef _HAVE_ARCH_COPY_AND_CSUM_FROM_USER -static inline +static __always_inline __wsum csum_and_copy_from_user (const void __user *src, void *dst, int len) { @@ -33,7 +33,7 @@ __wsum csum_and_copy_from_user (const void __user *src, void *dst, #endif #ifndef HAVE_CSUM_COPY_USER -static __inline__ __wsum csum_and_copy_to_user +static __always_inline __wsum csum_and_copy_to_user (const void *src, void __user *dst, int len) { __wsum sum = csum_partial(src, len, ~0U); @@ -45,7 +45,7 @@ static __inline__ __wsum csum_and_copy_to_user #endif #ifndef _HAVE_ARCH_CSUM_AND_COPY -static inline __wsum +static __always_inline __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len) { memcpy(dst, src, len); @@ -54,7 +54,7 @@ csum_partial_copy_nocheck(const void *src, void *dst, int len) #endif #ifndef HAVE_ARCH_CSUM_ADD -static inline __wsum csum_add(__wsum csum, __wsum addend) +static __always_inline __wsum csum_add(__wsum csum, __wsum addend) { u32 res = (__force u32)csum; res += (__force u32)addend; @@ -62,12 +62,12 @@ static inline __wsum csum_add(__wsum csum, __wsum addend) } #endif -static inline __wsum csum_sub(__wsum csum, __wsum addend) +static __always_inline __wsum csum_sub(__wsum csum, __wsum addend) { return csum_add(csum, ~addend); } -static inline __sum16 csum16_add(__sum16 csum, __be16 addend) +static __always_inline __sum16 csum16_add(__sum16 csum, __be16 addend) { u16 res = (__force u16)csum; @@ -75,12 +75,12 @@ static inline __sum16 csum16_add(__sum16 csum, __be16 addend) return (__force __sum16)(res + (res < (__force u16)addend)); } -static inline __sum16 csum16_sub(__sum16 csum, __be16 addend) +static __always_inline __sum16 csum16_sub(__sum16 csum, __be16 addend) { return csum16_add(csum, ~addend); } -static inline __wsum csum_shift(__wsum sum, int offset) +static __always_inline __wsum csum_shift(__wsum sum, int offset) { /* rotate sum to align it with a 16b boundary */ if (offset & 1) @@ -88,42 +88,43 @@ static inline __wsum csum_shift(__wsum sum, int offset) return sum; } -static inline __wsum +static __always_inline __wsum csum_block_add(__wsum csum, __wsum csum2, int offset) { return csum_add(csum, csum_shift(csum2, offset)); } -static inline __wsum +static __always_inline __wsum csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len) { return csum_block_add(csum, csum2, offset); } -static inline __wsum +static __always_inline __wsum csum_block_sub(__wsum csum, __wsum csum2, int offset) { return csum_block_add(csum, ~csum2, offset); } -static inline __wsum csum_unfold(__sum16 n) +static __always_inline __wsum csum_unfold(__sum16 n) { return (__force __wsum)n; } -static inline __wsum csum_partial_ext(const void *buff, int len, __wsum sum) +static __always_inline +__wsum csum_partial_ext(const void *buff, int len, __wsum sum) { return csum_partial(buff, len, sum); } #define CSUM_MANGLED_0 ((__force __sum16)0xffff) -static inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) +static __always_inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) { *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); } -static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) +static __always_inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) { __wsum tmp = csum_sub(~csum_unfold(*sum), (__force __wsum)from); @@ -136,7 +137,7 @@ static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) * m : old value of a 16bit field * m' : new value of a 16bit field */ -static inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) +static __always_inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) { *sum = ~csum16_add(csum16_sub(~(*sum), old), new); } @@ -150,16 +151,16 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, __wsum diff, bool pseudohdr); -static inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, - __be16 from, __be16 to, - bool pseudohdr) +static __always_inline +void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, + __be16 from, __be16 to, bool pseudohdr) { inet_proto_csum_replace4(sum, skb, (__force __be32)from, (__force __be32)to, pseudohdr); } -static inline __wsum remcsum_adjust(void *ptr, __wsum csum, - int start, int offset) +static __always_inline __wsum remcsum_adjust(void *ptr, __wsum csum, + int start, int offset) { __sum16 *psum = (__sum16 *)(ptr + offset); __wsum delta; @@ -175,12 +176,12 @@ static inline __wsum remcsum_adjust(void *ptr, __wsum csum, return delta; } -static inline void remcsum_unadjust(__sum16 *psum, __wsum delta) +static __always_inline void remcsum_unadjust(__sum16 *psum, __wsum delta) { *psum = csum_fold(csum_sub(delta, (__force __wsum)*psum)); } -static inline __wsum wsum_negate(__wsum val) +static __always_inline __wsum wsum_negate(__wsum val) { return (__force __wsum)-((__force u32)val); } -- cgit From b1a5983f56e371046dcf164f90bfaf704d2b89f6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 17 Feb 2022 23:41:20 +0100 Subject: netfilter: nf_tables_offload: incorrect flow offload action array size immediate verdict expression needs to allocate one slot in the flow offload action array, however, immediate data expression does not need to do so. fwd and dup expression need to allocate one slot, this is missing. Add a new offload_action interface to report if this expression needs to allocate one slot in the flow offload action array. Fixes: be2861dc36d7 ("netfilter: nft_{fwd,dup}_netdev: add offload support") Reported-and-tested-by: Nick Gregory Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- include/net/netfilter/nf_tables_offload.h | 2 -- net/netfilter/nf_tables_offload.c | 3 ++- net/netfilter/nft_dup_netdev.c | 6 ++++++ net/netfilter/nft_fwd_netdev.c | 6 ++++++ net/netfilter/nft_immediate.c | 12 +++++++++++- 6 files changed, 26 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index eaf55da9a205..c4c0861deac1 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -905,9 +905,9 @@ struct nft_expr_ops { int (*offload)(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr); + bool (*offload_action)(const struct nft_expr *expr); void (*offload_stats)(struct nft_expr *expr, const struct flow_stats *stats); - u32 offload_flags; const struct nft_expr_type *type; void *data; }; diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h index f9d95ff82df8..797147843958 100644 --- a/include/net/netfilter/nf_tables_offload.h +++ b/include/net/netfilter/nf_tables_offload.h @@ -67,8 +67,6 @@ struct nft_flow_rule { struct flow_rule *rule; }; -#define NFT_OFFLOAD_F_ACTION (1 << 0) - void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, enum flow_dissector_key_id addr_type); diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 9656c1646222..2d36952b1392 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -94,7 +94,8 @@ struct nft_flow_rule *nft_flow_rule_create(struct net *net, expr = nft_expr_first(rule); while (nft_expr_more(rule, expr)) { - if (expr->ops->offload_flags & NFT_OFFLOAD_F_ACTION) + if (expr->ops->offload_action && + expr->ops->offload_action(expr)) num_actions++; expr = nft_expr_next(expr); diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index bbf3fcba3df4..5b5c607fbf83 100644 --- a/net/netfilter/nft_dup_netdev.c +++ b/net/netfilter/nft_dup_netdev.c @@ -67,6 +67,11 @@ static int nft_dup_netdev_offload(struct nft_offload_ctx *ctx, return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_MIRRED, oif); } +static bool nft_dup_netdev_offload_action(const struct nft_expr *expr) +{ + return true; +} + static struct nft_expr_type nft_dup_netdev_type; static const struct nft_expr_ops nft_dup_netdev_ops = { .type = &nft_dup_netdev_type, @@ -75,6 +80,7 @@ static const struct nft_expr_ops nft_dup_netdev_ops = { .init = nft_dup_netdev_init, .dump = nft_dup_netdev_dump, .offload = nft_dup_netdev_offload, + .offload_action = nft_dup_netdev_offload_action, }; static struct nft_expr_type nft_dup_netdev_type __read_mostly = { diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index fa9301ca6033..619e394a91de 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -79,6 +79,11 @@ static int nft_fwd_netdev_offload(struct nft_offload_ctx *ctx, return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_REDIRECT, oif); } +static bool nft_fwd_netdev_offload_action(const struct nft_expr *expr) +{ + return true; +} + struct nft_fwd_neigh { u8 sreg_dev; u8 sreg_addr; @@ -222,6 +227,7 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = { .dump = nft_fwd_netdev_dump, .validate = nft_fwd_validate, .offload = nft_fwd_netdev_offload, + .offload_action = nft_fwd_netdev_offload_action, }; static const struct nft_expr_ops * diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 90c64d27ae53..d0f67d325bdf 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -213,6 +213,16 @@ static int nft_immediate_offload(struct nft_offload_ctx *ctx, return 0; } +static bool nft_immediate_offload_action(const struct nft_expr *expr) +{ + const struct nft_immediate_expr *priv = nft_expr_priv(expr); + + if (priv->dreg == NFT_REG_VERDICT) + return true; + + return false; +} + static const struct nft_expr_ops nft_imm_ops = { .type = &nft_imm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), @@ -224,7 +234,7 @@ static const struct nft_expr_ops nft_imm_ops = { .dump = nft_immediate_dump, .validate = nft_immediate_validate, .offload = nft_immediate_offload, - .offload_flags = NFT_OFFLOAD_F_ACTION, + .offload_action = nft_immediate_offload_action, }; struct nft_expr_type nft_imm_type __read_mostly = { -- cgit From 93dd04ab0b2b32ae6e70284afc764c577156658e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 18 Feb 2022 14:13:58 +0100 Subject: slab: remove __alloc_size attribute from __kmalloc_track_caller Commit c37495d6254c ("slab: add __alloc_size attributes for better bounds checking") added __alloc_size attributes to a bunch of kmalloc function prototypes. Unfortunately the change to __kmalloc_track_caller seems to cause clang to generate broken code and the first time this is called when booting, the box will crash. While the compiler problems are being reworked and attempted to be solved [1], let's just drop the attribute to solve the issue now. Once it is resolved it can be added back. [1] https://github.com/ClangBuiltLinux/linux/issues/1599 Fixes: c37495d6254c ("slab: add __alloc_size attributes for better bounds checking") Cc: stable Cc: Kees Cook Cc: Daniel Micay Cc: Nick Desaulniers Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Andrew Morton Cc: Vlastimil Babka Cc: Nathan Chancellor Cc: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org Cc: llvm@lists.linux.dev Signed-off-by: Greg Kroah-Hartman Acked-by: Nick Desaulniers Acked-by: David Rientjes Acked-by: Kees Cook Signed-off-by: Vlastimil Babka Link: https://lore.kernel.org/r/20220218131358.3032912-1-gregkh@linuxfoundation.org --- include/linux/slab.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 37bde99b74af..5b6193fd8bd9 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -660,8 +660,7 @@ static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flag * allocator where we care about the real place the memory allocation * request comes from. */ -extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) - __alloc_size(1); +extern void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller); #define kmalloc_track_caller(size, flags) \ __kmalloc_track_caller(size, flags, _RET_IP_) -- cgit From f6c052afe6f802d87c74153b7a57c43b2e9faf07 Mon Sep 17 00:00:00 2001 From: Christophe Kerello Date: Sun, 20 Feb 2022 15:14:31 +0000 Subject: nvmem: core: Fix a conflict between MTD and NVMEM on wp-gpios property Wp-gpios property can be used on NVMEM nodes and the same property can be also used on MTD NAND nodes. In case of the wp-gpios property is defined at NAND level node, the GPIO management is done at NAND driver level. Write protect is disabled when the driver is probed or resumed and is enabled when the driver is released or suspended. When no partitions are defined in the NAND DT node, then the NAND DT node will be passed to NVMEM framework. If wp-gpios property is defined in this node, the GPIO resource is taken twice and the NAND controller driver fails to probe. It would be possible to set config->wp_gpio at MTD level before calling nvmem_register function but NVMEM framework will toggle this GPIO on each write when this GPIO should only be controlled at NAND level driver to ensure that the Write Protect has not been enabled. A way to fix this conflict is to add a new boolean flag in nvmem_config named ignore_wp. In case ignore_wp is set, the GPIO resource will be managed by the provider. Fixes: 2a127da461a9 ("nvmem: add support for the write-protect pin") Cc: stable@vger.kernel.org Signed-off-by: Christophe Kerello Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20220220151432.16605-2-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/nvmem/core.c | 2 +- include/linux/nvmem-provider.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index 23a38dcf0fc4..9fd1602b539d 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -771,7 +771,7 @@ struct nvmem_device *nvmem_register(const struct nvmem_config *config) if (config->wp_gpio) nvmem->wp_gpio = config->wp_gpio; - else + else if (!config->ignore_wp) nvmem->wp_gpio = gpiod_get_optional(config->dev, "wp", GPIOD_OUT_HIGH); if (IS_ERR(nvmem->wp_gpio)) { diff --git a/include/linux/nvmem-provider.h b/include/linux/nvmem-provider.h index 98efb7b5660d..c9a3ac9efeaa 100644 --- a/include/linux/nvmem-provider.h +++ b/include/linux/nvmem-provider.h @@ -70,7 +70,8 @@ struct nvmem_keepout { * @word_size: Minimum read/write access granularity. * @stride: Minimum read/write access stride. * @priv: User context passed to read/write callbacks. - * @wp-gpio: Write protect pin + * @wp-gpio: Write protect pin + * @ignore_wp: Write Protect pin is managed by the provider. * * Note: A default "nvmem" name will be assigned to the device if * no name is specified in its configuration. In such case "" is @@ -92,6 +93,7 @@ struct nvmem_config { enum nvmem_type type; bool read_only; bool root_only; + bool ignore_wp; struct device_node *of_node; bool no_of_node; nvmem_reg_read_t reg_read; -- cgit From 93b71801a8274cd9511557faf04365a5de487197 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 22 Feb 2022 09:06:54 -0500 Subject: KVM: PPC: reserve capability 210 for KVM_CAP_PPC_AIL_MODE_3 Add KVM_CAP_PPC_AIL_MODE_3 to advertise the capability to set the AIL resource mode to 3 with the H_SET_MODE hypercall. This capability differs between processor types and KVM types (PR, HV, Nested HV), and affects guest-visible behaviour. QEMU will implement a cap-ail-mode-3 to control this behaviour[1], and use the KVM CAP if available to determine KVM support[2]. Reviewed-by: Fabiano Rosas Signed-off-by: Nicholas Piggin Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 14 ++++++++++++++ include/uapi/linux/kvm.h | 1 + tools/include/uapi/linux/kvm.h | 1 + 3 files changed, 16 insertions(+) (limited to 'include') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index a4267104db50..9954568c7eab 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6997,6 +6997,20 @@ indicated by the fd to the VM this is called on. This is intended to support intra-host migration of VMs between userspace VMMs, upgrading the VMM process without interrupting the guest. +7.30 KVM_CAP_PPC_AIL_MODE_3 +------------------------------- + +:Capability: KVM_CAP_PPC_AIL_MODE_3 +:Architectures: ppc +:Type: vm + +This capability indicates that the kernel supports the mode 3 setting for the +"Address Translation Mode on Interrupt" aka "Alternate Interrupt Location" +resource that is controlled with the H_SET_MODE hypercall. + +This capability allows a guest kernel to use a better-performance mode for +handling interrupts and system calls. + 8. Other capabilities. ====================== diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 5191b57e1562..507ee1f2aa96 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1134,6 +1134,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_GPA_BITS 207 #define KVM_CAP_XSAVE2 208 #define KVM_CAP_SYS_ATTRIBUTES 209 +#define KVM_CAP_PPC_AIL_MODE_3 210 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 5191b57e1562..507ee1f2aa96 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1134,6 +1134,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_GPA_BITS 207 #define KVM_CAP_XSAVE2 208 #define KVM_CAP_SYS_ATTRIBUTES 209 +#define KVM_CAP_PPC_AIL_MODE_3 210 #ifdef KVM_CAP_IRQ_ROUTING -- cgit From c2700d2886a87f83f31e0a301de1d2350b52c79b Mon Sep 17 00:00:00 2001 From: Varun Prakash Date: Sat, 22 Jan 2022 22:27:44 +0530 Subject: nvme-tcp: send H2CData PDUs based on MAXH2CDATA As per NVMe/TCP specification (revision 1.0a, section 3.6.2.3) Maximum Host to Controller Data length (MAXH2CDATA): Specifies the maximum number of PDU-Data bytes per H2CData PDU in bytes. This value is a multiple of dwords and should be no less than 4,096. Current code sets H2CData PDU data_length to r2t_length, it does not check MAXH2CDATA value. Fix this by setting H2CData PDU data_length to min(req->h2cdata_left, queue->maxh2cdata). Also validate MAXH2CDATA value returned by target in ICResp PDU, if it is not a multiple of dword or if it is less than 4096 return -EINVAL from nvme_tcp_init_connection(). Signed-off-by: Varun Prakash Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 63 +++++++++++++++++++++++++++++++++++++----------- include/linux/nvme-tcp.h | 1 + 2 files changed, 50 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 891a36d02e7c..65e00c64a588 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -44,6 +44,8 @@ struct nvme_tcp_request { u32 data_len; u32 pdu_len; u32 pdu_sent; + u32 h2cdata_left; + u32 h2cdata_offset; u16 ttag; __le16 status; struct list_head entry; @@ -95,6 +97,7 @@ struct nvme_tcp_queue { struct nvme_tcp_request *request; int queue_size; + u32 maxh2cdata; size_t cmnd_capsule_len; struct nvme_tcp_ctrl *ctrl; unsigned long flags; @@ -572,23 +575,26 @@ static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue, return ret; } -static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req, - struct nvme_tcp_r2t_pdu *pdu) +static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req) { struct nvme_tcp_data_pdu *data = req->pdu; struct nvme_tcp_queue *queue = req->queue; struct request *rq = blk_mq_rq_from_pdu(req); + u32 h2cdata_sent = req->pdu_len; u8 hdgst = nvme_tcp_hdgst_len(queue); u8 ddgst = nvme_tcp_ddgst_len(queue); req->state = NVME_TCP_SEND_H2C_PDU; req->offset = 0; - req->pdu_len = le32_to_cpu(pdu->r2t_length); + req->pdu_len = min(req->h2cdata_left, queue->maxh2cdata); req->pdu_sent = 0; + req->h2cdata_left -= req->pdu_len; + req->h2cdata_offset += h2cdata_sent; memset(data, 0, sizeof(*data)); data->hdr.type = nvme_tcp_h2c_data; - data->hdr.flags = NVME_TCP_F_DATA_LAST; + if (!req->h2cdata_left) + data->hdr.flags = NVME_TCP_F_DATA_LAST; if (queue->hdr_digest) data->hdr.flags |= NVME_TCP_F_HDGST; if (queue->data_digest) @@ -597,9 +603,9 @@ static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req, data->hdr.pdo = data->hdr.hlen + hdgst; data->hdr.plen = cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst); - data->ttag = pdu->ttag; + data->ttag = req->ttag; data->command_id = nvme_cid(rq); - data->data_offset = pdu->r2t_offset; + data->data_offset = cpu_to_le32(req->h2cdata_offset); data->data_length = cpu_to_le32(req->pdu_len); } @@ -609,6 +615,7 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, struct nvme_tcp_request *req; struct request *rq; u32 r2t_length = le32_to_cpu(pdu->r2t_length); + u32 r2t_offset = le32_to_cpu(pdu->r2t_offset); rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id); if (!rq) { @@ -633,14 +640,19 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, return -EPROTO; } - if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) { + if (unlikely(r2t_offset < req->data_sent)) { dev_err(queue->ctrl->ctrl.device, "req %d unexpected r2t offset %u (expected %zu)\n", - rq->tag, le32_to_cpu(pdu->r2t_offset), req->data_sent); + rq->tag, r2t_offset, req->data_sent); return -EPROTO; } - nvme_tcp_setup_h2c_data_pdu(req, pdu); + req->pdu_len = 0; + req->h2cdata_left = r2t_length; + req->h2cdata_offset = r2t_offset; + req->ttag = pdu->ttag; + + nvme_tcp_setup_h2c_data_pdu(req); nvme_tcp_queue_request(req, false, true); return 0; @@ -928,6 +940,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; int req_data_len = req->data_len; + u32 h2cdata_left = req->h2cdata_left; while (true) { struct page *page = nvme_tcp_req_cur_page(req); @@ -972,7 +985,10 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) req->state = NVME_TCP_SEND_DDGST; req->offset = 0; } else { - nvme_tcp_done_send_req(queue); + if (h2cdata_left) + nvme_tcp_setup_h2c_data_pdu(req); + else + nvme_tcp_done_send_req(queue); } return 1; } @@ -1030,9 +1046,14 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) if (queue->hdr_digest && !req->offset) nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); - ret = kernel_sendpage(queue->sock, virt_to_page(pdu), - offset_in_page(pdu) + req->offset, len, - MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST); + if (!req->h2cdata_left) + ret = kernel_sendpage(queue->sock, virt_to_page(pdu), + offset_in_page(pdu) + req->offset, len, + MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST); + else + ret = sock_no_sendpage(queue->sock, virt_to_page(pdu), + offset_in_page(pdu) + req->offset, len, + MSG_DONTWAIT | MSG_MORE); if (unlikely(ret <= 0)) return ret; @@ -1052,6 +1073,7 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; size_t offset = req->offset; + u32 h2cdata_left = req->h2cdata_left; int ret; struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; struct kvec iov = { @@ -1069,7 +1091,10 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) return ret; if (offset + ret == NVME_TCP_DIGEST_LENGTH) { - nvme_tcp_done_send_req(queue); + if (h2cdata_left) + nvme_tcp_setup_h2c_data_pdu(req); + else + nvme_tcp_done_send_req(queue); return 1; } @@ -1261,6 +1286,7 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) struct msghdr msg = {}; struct kvec iov; bool ctrl_hdgst, ctrl_ddgst; + u32 maxh2cdata; int ret; icreq = kzalloc(sizeof(*icreq), GFP_KERNEL); @@ -1344,6 +1370,14 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) goto free_icresp; } + maxh2cdata = le32_to_cpu(icresp->maxdata); + if ((maxh2cdata % 4) || (maxh2cdata < NVME_TCP_MIN_MAXH2CDATA)) { + pr_err("queue %d: invalid maxh2cdata returned %u\n", + nvme_tcp_queue_id(queue), maxh2cdata); + goto free_icresp; + } + queue->maxh2cdata = maxh2cdata; + ret = 0; free_icresp: kfree(icresp); @@ -2329,6 +2363,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, req->data_sent = 0; req->pdu_len = 0; req->pdu_sent = 0; + req->h2cdata_left = 0; req->data_len = blk_rq_nr_phys_segments(rq) ? blk_rq_payload_bytes(rq) : 0; req->curr_bio = rq->bio; diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h index 959e0bd9a913..75470159a194 100644 --- a/include/linux/nvme-tcp.h +++ b/include/linux/nvme-tcp.h @@ -12,6 +12,7 @@ #define NVME_TCP_DISC_PORT 8009 #define NVME_TCP_ADMIN_CCSZ SZ_8K #define NVME_TCP_DIGEST_LENGTH 4 +#define NVME_TCP_MIN_MAXH2CDATA 4096 enum nvme_tcp_pfv { NVME_TCP_PFV_1_0 = 0x0, -- cgit From d9b5ae5c1b241b91480aa30408be12fe91af834a Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Wed, 23 Feb 2022 18:34:16 +0200 Subject: openvswitch: Fix setting ipv6 fields causing hw csum failure Ipv6 ttl, label and tos fields are modified without first pulling/pushing the ipv6 header, which would have updated the hw csum (if available). This might cause csum validation when sending the packet to the stack, as can be seen in the trace below. Fix this by updating skb->csum if available. Trace resulted by ipv6 ttl dec and then sending packet to conntrack [actions: set(ipv6(hlimit=63)),ct(zone=99)]: [295241.900063] s_pf0vf2: hw csum failure [295241.923191] Call Trace: [295241.925728] [295241.927836] dump_stack+0x5c/0x80 [295241.931240] __skb_checksum_complete+0xac/0xc0 [295241.935778] nf_conntrack_tcp_packet+0x398/0xba0 [nf_conntrack] [295241.953030] nf_conntrack_in+0x498/0x5e0 [nf_conntrack] [295241.958344] __ovs_ct_lookup+0xac/0x860 [openvswitch] [295241.968532] ovs_ct_execute+0x4a7/0x7c0 [openvswitch] [295241.979167] do_execute_actions+0x54a/0xaa0 [openvswitch] [295242.001482] ovs_execute_actions+0x48/0x100 [openvswitch] [295242.006966] ovs_dp_process_packet+0x96/0x1d0 [openvswitch] [295242.012626] ovs_vport_receive+0x6c/0xc0 [openvswitch] [295242.028763] netdev_frame_hook+0xc0/0x180 [openvswitch] [295242.034074] __netif_receive_skb_core+0x2ca/0xcb0 [295242.047498] netif_receive_skb_internal+0x3e/0xc0 [295242.052291] napi_gro_receive+0xba/0xe0 [295242.056231] mlx5e_handle_rx_cqe_mpwrq_rep+0x12b/0x250 [mlx5_core] [295242.062513] mlx5e_poll_rx_cq+0xa0f/0xa30 [mlx5_core] [295242.067669] mlx5e_napi_poll+0xe1/0x6b0 [mlx5_core] [295242.077958] net_rx_action+0x149/0x3b0 [295242.086762] __do_softirq+0xd7/0x2d6 [295242.090427] irq_exit+0xf7/0x100 [295242.093748] do_IRQ+0x7f/0xd0 [295242.096806] common_interrupt+0xf/0xf [295242.100559] [295242.102750] RIP: 0033:0x7f9022e88cbd [295242.125246] RSP: 002b:00007f9022282b20 EFLAGS: 00000246 ORIG_RAX: ffffffffffffffda [295242.132900] RAX: 0000000000000005 RBX: 0000000000000010 RCX: 0000000000000000 [295242.140120] RDX: 00007f9022282ba8 RSI: 00007f9022282a30 RDI: 00007f9014005c30 [295242.147337] RBP: 00007f9014014d60 R08: 0000000000000020 R09: 00007f90254a8340 [295242.154557] R10: 00007f9022282a28 R11: 0000000000000246 R12: 0000000000000000 [295242.161775] R13: 00007f902308c000 R14: 000000000000002b R15: 00007f9022b71f40 Fixes: 3fdbd1ce11e5 ("openvswitch: add ipv6 'set' action") Signed-off-by: Paul Blakey Link: https://lore.kernel.org/r/20220223163416.24096-1-paulb@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/checksum.h | 5 +++++ net/openvswitch/actions.c | 46 ++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 43 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/checksum.h b/include/net/checksum.h index 02d0c2d01014..79c67f14c448 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -142,6 +142,11 @@ static __always_inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) *sum = ~csum16_add(csum16_sub(~(*sum), old), new); } +static inline void csum_replace(__wsum *csum, __wsum old, __wsum new) +{ + *csum = csum_add(csum_sub(*csum, old), new); +} + struct sk_buff; void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, bool pseudohdr); diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 076774034bb9..780d9e2246f3 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -423,12 +423,43 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, memcpy(addr, new_addr, sizeof(__be32[4])); } -static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask) +static void set_ipv6_dsfield(struct sk_buff *skb, struct ipv6hdr *nh, u8 ipv6_tclass, u8 mask) { + u8 old_ipv6_tclass = ipv6_get_dsfield(nh); + + ipv6_tclass = OVS_MASKED(old_ipv6_tclass, ipv6_tclass, mask); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum_replace(&skb->csum, (__force __wsum)(old_ipv6_tclass << 12), + (__force __wsum)(ipv6_tclass << 12)); + + ipv6_change_dsfield(nh, ~mask, ipv6_tclass); +} + +static void set_ipv6_fl(struct sk_buff *skb, struct ipv6hdr *nh, u32 fl, u32 mask) +{ + u32 ofl; + + ofl = nh->flow_lbl[0] << 16 | nh->flow_lbl[1] << 8 | nh->flow_lbl[2]; + fl = OVS_MASKED(ofl, fl, mask); + /* Bits 21-24 are always unmasked, so this retains their values. */ - OVS_SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16)); - OVS_SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8)); - OVS_SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask); + nh->flow_lbl[0] = (u8)(fl >> 16); + nh->flow_lbl[1] = (u8)(fl >> 8); + nh->flow_lbl[2] = (u8)fl; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum_replace(&skb->csum, (__force __wsum)htonl(ofl), (__force __wsum)htonl(fl)); +} + +static void set_ipv6_ttl(struct sk_buff *skb, struct ipv6hdr *nh, u8 new_ttl, u8 mask) +{ + new_ttl = OVS_MASKED(nh->hop_limit, new_ttl, mask); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum_replace(&skb->csum, (__force __wsum)(nh->hop_limit << 8), + (__force __wsum)(new_ttl << 8)); + nh->hop_limit = new_ttl; } static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl, @@ -546,18 +577,17 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, } } if (mask->ipv6_tclass) { - ipv6_change_dsfield(nh, ~mask->ipv6_tclass, key->ipv6_tclass); + set_ipv6_dsfield(skb, nh, key->ipv6_tclass, mask->ipv6_tclass); flow_key->ip.tos = ipv6_get_dsfield(nh); } if (mask->ipv6_label) { - set_ipv6_fl(nh, ntohl(key->ipv6_label), + set_ipv6_fl(skb, nh, ntohl(key->ipv6_label), ntohl(mask->ipv6_label)); flow_key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); } if (mask->ipv6_hlimit) { - OVS_SET_MASKED(nh->hop_limit, key->ipv6_hlimit, - mask->ipv6_hlimit); + set_ipv6_ttl(skb, nh, key->ipv6_hlimit, mask->ipv6_hlimit); flow_key->ip.ttl = nh->hop_limit; } return 0; -- cgit From bc82c38a6933aab308387d4aca47e0a05de7b553 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 11 Feb 2022 08:10:18 +0100 Subject: tracing: Uninline trace_trigger_soft_disabled() partly On a powerpc32 build with CONFIG_CC_OPTIMISE_FOR_SIZE, the inline keyword is not honored and trace_trigger_soft_disabled() appears approx 50 times in vmlinux. Adding -Winline to the build, the following message appears: ./include/linux/trace_events.h:712:1: error: inlining failed in call to 'trace_trigger_soft_disabled': call is unlikely and code size would grow [-Werror=inline] That function is rather big for an inlined function: c003df60 : c003df60: 94 21 ff f0 stwu r1,-16(r1) c003df64: 7c 08 02 a6 mflr r0 c003df68: 90 01 00 14 stw r0,20(r1) c003df6c: bf c1 00 08 stmw r30,8(r1) c003df70: 83 e3 00 24 lwz r31,36(r3) c003df74: 73 e9 01 00 andi. r9,r31,256 c003df78: 41 82 00 10 beq c003df88 c003df7c: 38 60 00 00 li r3,0 c003df80: 39 61 00 10 addi r11,r1,16 c003df84: 4b fd 60 ac b c0014030 <_rest32gpr_30_x> c003df88: 73 e9 00 80 andi. r9,r31,128 c003df8c: 7c 7e 1b 78 mr r30,r3 c003df90: 41 a2 00 14 beq c003dfa4 c003df94: 38 c0 00 00 li r6,0 c003df98: 38 a0 00 00 li r5,0 c003df9c: 38 80 00 00 li r4,0 c003dfa0: 48 05 c5 f1 bl c009a590 c003dfa4: 73 e9 00 40 andi. r9,r31,64 c003dfa8: 40 82 00 28 bne c003dfd0 c003dfac: 73 ff 02 00 andi. r31,r31,512 c003dfb0: 41 82 ff cc beq c003df7c c003dfb4: 80 01 00 14 lwz r0,20(r1) c003dfb8: 83 e1 00 0c lwz r31,12(r1) c003dfbc: 7f c3 f3 78 mr r3,r30 c003dfc0: 83 c1 00 08 lwz r30,8(r1) c003dfc4: 7c 08 03 a6 mtlr r0 c003dfc8: 38 21 00 10 addi r1,r1,16 c003dfcc: 48 05 6f 6c b c0094f38 c003dfd0: 38 60 00 01 li r3,1 c003dfd4: 4b ff ff ac b c003df80 However it is located in a hot path so inlining it is important. But forcing inlining of the entire function by using __always_inline leads to increasing the text size by approx 20 kbytes. Instead, split the fonction in two parts, one part with the likely fast path, flagged __always_inline, and a second part out of line. With this change, on a powerpc32 with CONFIG_CC_OPTIMISE_FOR_SIZE vmlinux text increases by only 1,4 kbytes, which is partly compensated by a decrease of vmlinux data by 7 kbytes. On ppc64_defconfig which has CONFIG_CC_OPTIMISE_FOR_SPEED, this change reduces vmlinux text by more than 30 kbytes. Link: https://lkml.kernel.org/r/69ce0986a52d026d381d612801d978aa4f977460.1644563295.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 22 ++++++++++++---------- kernel/trace/trace_events_trigger.c | 14 ++++++++++++++ 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 70c069aef02c..dcea51fb60e2 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -699,6 +699,8 @@ event_triggers_post_call(struct trace_event_file *file, bool trace_event_ignore_this_pid(struct trace_event_file *trace_file); +bool __trace_trigger_soft_disabled(struct trace_event_file *file); + /** * trace_trigger_soft_disabled - do triggers and test if soft disabled * @file: The file pointer of the event to test @@ -708,20 +710,20 @@ bool trace_event_ignore_this_pid(struct trace_event_file *trace_file); * triggers that require testing the fields, it will return true, * otherwise false. */ -static inline bool +static __always_inline bool trace_trigger_soft_disabled(struct trace_event_file *file) { unsigned long eflags = file->flags; - if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) { - if (eflags & EVENT_FILE_FL_TRIGGER_MODE) - event_triggers_call(file, NULL, NULL, NULL); - if (eflags & EVENT_FILE_FL_SOFT_DISABLED) - return true; - if (eflags & EVENT_FILE_FL_PID_FILTER) - return trace_event_ignore_this_pid(file); - } - return false; + if (likely(!(eflags & (EVENT_FILE_FL_TRIGGER_MODE | + EVENT_FILE_FL_SOFT_DISABLED | + EVENT_FILE_FL_PID_FILTER)))) + return false; + + if (likely(eflags & EVENT_FILE_FL_TRIGGER_COND)) + return false; + + return __trace_trigger_soft_disabled(file); } #ifdef CONFIG_BPF_EVENTS diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index efe563140f27..7eb9d04f1c2e 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -84,6 +84,20 @@ event_triggers_call(struct trace_event_file *file, } EXPORT_SYMBOL_GPL(event_triggers_call); +bool __trace_trigger_soft_disabled(struct trace_event_file *file) +{ + unsigned long eflags = file->flags; + + if (eflags & EVENT_FILE_FL_TRIGGER_MODE) + event_triggers_call(file, NULL, NULL, NULL); + if (eflags & EVENT_FILE_FL_SOFT_DISABLED) + return true; + if (eflags & EVENT_FILE_FL_PID_FILTER) + return trace_event_ignore_this_pid(file); + return false; +} +EXPORT_SYMBOL_GPL(__trace_trigger_soft_disabled); + /** * event_triggers_post_call - Call 'post_triggers' for a trace event * @file: The trace_event_file associated with the event -- cgit