From a6828214480e2f00a8a7e64c7a55fc42b0f54e1c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 5 Sep 2023 17:49:35 -0400 Subject: workqueue: Removed double allocation of wq_update_pod_attrs_buf First commit 2930155b2e272 ("workqueue: Initialize unbound CPU pods later in the boot") added the initialization of wq_update_pod_attrs_buf to workqueue_init_early(), and then latter on, commit 84193c07105c6 ("workqueue: Generalize unbound CPU pods") added it as well. This appeared in a kmemleak run where the second allocation made the first allocation leak. Fixes: 84193c07105c6 ("workqueue: Generalize unbound CPU pods") Signed-off-by: Steven Rostedt (Google) Reviewed-by: Geert Uytterhoeven Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c85825e17df8..129328b765fb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -6535,9 +6535,6 @@ void __init workqueue_init_early(void) BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE)); - wq_update_pod_attrs_buf = alloc_workqueue_attrs(); - BUG_ON(!wq_update_pod_attrs_buf); - pt->nr_pods = 1; cpumask_copy(pt->pod_cpus[0], cpu_possible_mask); pt->pod_node[0] = NUMA_NO_NODE; -- cgit From dd64c873ed11cdae340be06dcd2364870fd3e4fc Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 11 Sep 2023 16:27:22 +0800 Subject: workqueue: Fix missed pwq_release_worker creation in wq_cpu_intensive_thresh_init() Currently, if the wq_cpu_intensive_thresh_us is set to specific value, will cause the wq_cpu_intensive_thresh_init() early exit and missed creation of pwq_release_worker. this commit therefore create the pwq_release_worker in advance before checking the wq_cpu_intensive_thresh_us. Signed-off-by: Zqiang Signed-off-by: Tejun Heo Fixes: 967b494e2fd1 ("workqueue: Use a kthread_worker to release pool_workqueues") --- kernel/workqueue.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 129328b765fb..b9f053a5a5f0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -6602,13 +6602,13 @@ static void __init wq_cpu_intensive_thresh_init(void) unsigned long thresh; unsigned long bogo; + pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release"); + BUG_ON(IS_ERR(pwq_release_worker)); + /* if the user set it to a specific value, keep it */ if (wq_cpu_intensive_thresh_us != ULONG_MAX) return; - pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release"); - BUG_ON(IS_ERR(pwq_release_worker)); - /* * The default of 10ms is derived from the fact that most modern (as of * 2023) processors can do a lot in 10ms and that it's just below what -- cgit From 643445531829d89dc5ddbe0c5ee4ff8f84ce8687 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 20 Sep 2023 14:07:04 +0800 Subject: workqueue: Fix UAF report by KASAN in pwq_release_workfn() Currently, for UNBOUND wq, if the apply_wqattrs_prepare() return error, the apply_wqattr_cleanup() will be called and use the pwq_release_worker kthread to release resources asynchronously. however, the kfree(wq) is invoked directly in failure path of alloc_workqueue(), if the kfree(wq) has been executed and when the pwq_release_workfn() accesses wq, this leads to the following scenario: BUG: KASAN: slab-use-after-free in pwq_release_workfn+0x339/0x380 kernel/workqueue.c:4124 Read of size 4 at addr ffff888027b831c0 by task pool_workqueue_/3 CPU: 0 PID: 3 Comm: pool_workqueue_ Not tainted 6.5.0-rc7-next-20230825-syzkaller #0 Hardware name: Google Compute Engine/Google Compute Engine, BIOS Google 07/26/2023 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xd9/0x1b0 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:364 [inline] print_report+0xc4/0x620 mm/kasan/report.c:475 kasan_report+0xda/0x110 mm/kasan/report.c:588 pwq_release_workfn+0x339/0x380 kernel/workqueue.c:4124 kthread_worker_fn+0x2fc/0xa80 kernel/kthread.c:823 kthread+0x33a/0x430 kernel/kthread.c:388 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:304 Allocated by task 5054: kasan_save_stack+0x33/0x50 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 ____kasan_kmalloc mm/kasan/common.c:374 [inline] __kasan_kmalloc+0xa2/0xb0 mm/kasan/common.c:383 kmalloc include/linux/slab.h:599 [inline] kzalloc include/linux/slab.h:720 [inline] alloc_workqueue+0x16f/0x1490 kernel/workqueue.c:4684 kvm_mmu_init_tdp_mmu+0x23/0x100 arch/x86/kvm/mmu/tdp_mmu.c:19 kvm_mmu_init_vm+0x248/0x2e0 arch/x86/kvm/mmu/mmu.c:6180 kvm_arch_init_vm+0x39/0x720 arch/x86/kvm/x86.c:12311 kvm_create_vm arch/x86/kvm/../../../virt/kvm/kvm_main.c:1222 [inline] kvm_dev_ioctl_create_vm arch/x86/kvm/../../../virt/kvm/kvm_main.c:5089 [inline] kvm_dev_ioctl+0xa31/0x1c20 arch/x86/kvm/../../../virt/kvm/kvm_main.c:5131 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl fs/ioctl.c:857 [inline] __x64_sys_ioctl+0x18f/0x210 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Freed by task 5054: kasan_save_stack+0x33/0x50 mm/kasan/common.c:45 kasan_set_track+0x25/0x30 mm/kasan/common.c:52 kasan_save_free_info+0x2b/0x40 mm/kasan/generic.c:522 ____kasan_slab_free mm/kasan/common.c:236 [inline] ____kasan_slab_free+0x15b/0x1b0 mm/kasan/common.c:200 kasan_slab_free include/linux/kasan.h:164 [inline] slab_free_hook mm/slub.c:1800 [inline] slab_free_freelist_hook+0x114/0x1e0 mm/slub.c:1826 slab_free mm/slub.c:3809 [inline] __kmem_cache_free+0xb8/0x2f0 mm/slub.c:3822 alloc_workqueue+0xe76/0x1490 kernel/workqueue.c:4746 kvm_mmu_init_tdp_mmu+0x23/0x100 arch/x86/kvm/mmu/tdp_mmu.c:19 kvm_mmu_init_vm+0x248/0x2e0 arch/x86/kvm/mmu/mmu.c:6180 kvm_arch_init_vm+0x39/0x720 arch/x86/kvm/x86.c:12311 kvm_create_vm arch/x86/kvm/../../../virt/kvm/kvm_main.c:1222 [inline] kvm_dev_ioctl_create_vm arch/x86/kvm/../../../virt/kvm/kvm_main.c:5089 [inline] kvm_dev_ioctl+0xa31/0x1c20 arch/x86/kvm/../../../virt/kvm/kvm_main.c:5131 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:871 [inline] __se_sys_ioctl fs/ioctl.c:857 [inline] __x64_sys_ioctl+0x18f/0x210 fs/ioctl.c:857 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x38/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd This commit therefore flush pwq_release_worker in the alloc_and_link_pwqs() before invoke kfree(wq). Reported-by: syzbot+60db9f652c92d5bacba4@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=60db9f652c92d5bacba4 Signed-off-by: Zqiang Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b9f053a5a5f0..ebe24a5e1435 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4600,6 +4600,12 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) } cpus_read_unlock(); + /* for unbound pwq, flush the pwq_release_worker ensures that the + * pwq_release_workfn() completes before calling kfree(wq). + */ + if (ret) + kthread_flush_worker(pwq_release_worker); + return ret; enomem: -- cgit From 7b42f401fc6571b6604441789d892d440829e33c Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 11 Oct 2023 16:27:59 +0800 Subject: workqueue: Use the kmem_cache_free() instead of kfree() to release pwq Currently, the kfree() be used for pwq objects allocated with kmem_cache_alloc() in alloc_and_link_pwqs(), this isn't wrong. but usually, use "trace_kmem_cache_alloc/trace_kmem_cache_free" to track memory allocation and free. this commit therefore use kmem_cache_free() instead of kfree() in alloc_and_link_pwqs() and also consistent with release of the pwq in rcu_free_pwq(). Signed-off-by: Zqiang Signed-off-by: Tejun Heo --- kernel/workqueue.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ebe24a5e1435..6f74cab2bd5a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4610,8 +4610,12 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) enomem: if (wq->cpu_pwq) { - for_each_possible_cpu(cpu) - kfree(*per_cpu_ptr(wq->cpu_pwq, cpu)); + for_each_possible_cpu(cpu) { + struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); + + if (pwq) + kmem_cache_free(pwq_cache, pwq); + } free_percpu(wq->cpu_pwq); wq->cpu_pwq = NULL; } -- cgit From ca10d851b9ad0338c19e8e3089e24d565ebfffd7 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 10 Oct 2023 22:48:42 -0400 Subject: workqueue: Override implicit ordered attribute in workqueue_apply_unbound_cpumask() Commit 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") enabled implicit ordered attribute to be added to WQ_UNBOUND workqueues with max_active of 1. This prevented the changing of attributes to these workqueues leading to fix commit 0a94efb5acbb ("workqueue: implicit ordered attribute should be overridable"). However, workqueue_apply_unbound_cpumask() was not updated at that time. So sysfs changes to wq_unbound_cpumask has no effect on WQ_UNBOUND workqueues with implicit ordered attribute. Since not all WQ_UNBOUND workqueues are visible on sysfs, we are not able to make all the necessary cpumask changes even if we iterates all the workqueue cpumasks in sysfs and changing them one by one. Fix this problem by applying the corresponding change made to apply_workqueue_attrs_locked() in the fix commit to workqueue_apply_unbound_cpumask(). Fixes: 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/workqueue.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6f74cab2bd5a..51177ffe16f1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5792,9 +5792,13 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) list_for_each_entry(wq, &workqueues, list) { if (!(wq->flags & WQ_UNBOUND)) continue; + /* creating multiple pwqs breaks ordering guarantee */ - if (wq->flags & __WQ_ORDERED) - continue; + if (!list_empty(&wq->pwqs)) { + if (wq->flags & __WQ_ORDERED_EXPLICIT) + continue; + wq->flags &= ~__WQ_ORDERED; + } ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask); if (IS_ERR(ctx)) { -- cgit From 5d9c7a1e3e8e18db8e10c546de648cda2a57be52 Mon Sep 17 00:00:00 2001 From: Lucy Mielke Date: Mon, 9 Oct 2023 19:09:46 +0200 Subject: workqueue: fix -Wformat-truncation in create_worker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling with W=1 emitted the following warning (Compiler: gcc (x86-64, ver. 13.2.1, .config: result of make allyesconfig, "Treat warnings as errors" turned off): kernel/workqueue.c:2188:54: warning: ‘%d’ directive output may be truncated writing between 1 and 10 bytes into a region of size between 5 and 14 [-Wformat-truncation=] kernel/workqueue.c:2188:50: note: directive argument in the range [0, 2147483647] kernel/workqueue.c:2188:17: note: ‘snprintf’ output between 4 and 23 bytes into a destination of size 16 setting "id_buf" to size 23 will silence the warning, since GCC determines snprintf's output to be max. 23 bytes in line 2188. Please let me know if there are any mistakes in my patch! Signed-off-by: Lucy Mielke Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 51177ffe16f1..a3522b70218d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2166,7 +2166,7 @@ static struct worker *create_worker(struct worker_pool *pool) { struct worker *worker; int id; - char id_buf[16]; + char id_buf[23]; /* ID is needed to determine kthread name */ id = ida_alloc(&pool->worker_ida, GFP_KERNEL); -- cgit From 265f3ed077036f053981f5eea0b5b43e7c5b39ff Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 24 Sep 2023 17:07:02 +0200 Subject: workqueue: Provide one lock class key per work_on_cpu() callsite All callers of work_on_cpu() share the same lock class key for all the functions queued. As a result the workqueue related locking scenario for a function A may be spuriously accounted as an inversion against the locking scenario of function B such as in the following model: long A(void *arg) { mutex_lock(&mutex); mutex_unlock(&mutex); } long B(void *arg) { } void launchA(void) { work_on_cpu(0, A, NULL); } void launchB(void) { mutex_lock(&mutex); work_on_cpu(1, B, NULL); mutex_unlock(&mutex); } launchA and launchB running concurrently have no chance to deadlock. However the above can be reported by lockdep as a possible locking inversion because the works containing A() and B() are treated as belonging to the same locking class. The following shows an existing example of such a spurious lockdep splat: ====================================================== WARNING: possible circular locking dependency detected 6.6.0-rc1-00065-g934ebd6e5359 #35409 Not tainted ------------------------------------------------------ kworker/0:1/9 is trying to acquire lock: ffffffff9bc72f30 (cpu_hotplug_lock){++++}-{0:0}, at: _cpu_down+0x57/0x2b0 but task is already holding lock: ffff9e3bc0057e60 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_scheduled_works+0x216/0x500 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 ((work_completion)(&wfc.work)){+.+.}-{0:0}: __flush_work+0x83/0x4e0 work_on_cpu+0x97/0xc0 rcu_nocb_cpu_offload+0x62/0xb0 rcu_nocb_toggle+0xd0/0x1d0 kthread+0xe6/0x120 ret_from_fork+0x2f/0x40 ret_from_fork_asm+0x1b/0x30 -> #1 (rcu_state.barrier_mutex){+.+.}-{3:3}: __mutex_lock+0x81/0xc80 rcu_nocb_cpu_deoffload+0x38/0xb0 rcu_nocb_toggle+0x144/0x1d0 kthread+0xe6/0x120 ret_from_fork+0x2f/0x40 ret_from_fork_asm+0x1b/0x30 -> #0 (cpu_hotplug_lock){++++}-{0:0}: __lock_acquire+0x1538/0x2500 lock_acquire+0xbf/0x2a0 percpu_down_write+0x31/0x200 _cpu_down+0x57/0x2b0 __cpu_down_maps_locked+0x10/0x20 work_for_cpu_fn+0x15/0x20 process_scheduled_works+0x2a7/0x500 worker_thread+0x173/0x330 kthread+0xe6/0x120 ret_from_fork+0x2f/0x40 ret_from_fork_asm+0x1b/0x30 other info that might help us debug this: Chain exists of: cpu_hotplug_lock --> rcu_state.barrier_mutex --> (work_completion)(&wfc.work) Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock((work_completion)(&wfc.work)); lock(rcu_state.barrier_mutex); lock((work_completion)(&wfc.work)); lock(cpu_hotplug_lock); *** DEADLOCK *** 2 locks held by kworker/0:1/9: #0: ffff900481068b38 ((wq_completion)events){+.+.}-{0:0}, at: process_scheduled_works+0x212/0x500 #1: ffff9e3bc0057e60 ((work_completion)(&wfc.work)){+.+.}-{0:0}, at: process_scheduled_works+0x216/0x500 stack backtrace: CPU: 0 PID: 9 Comm: kworker/0:1 Not tainted 6.6.0-rc1-00065-g934ebd6e5359 #35409 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 Workqueue: events work_for_cpu_fn Call Trace: rcu-torture: rcu_torture_read_exit: Start of episode dump_stack_lvl+0x4a/0x80 check_noncircular+0x132/0x150 __lock_acquire+0x1538/0x2500 lock_acquire+0xbf/0x2a0 ? _cpu_down+0x57/0x2b0 percpu_down_write+0x31/0x200 ? _cpu_down+0x57/0x2b0 _cpu_down+0x57/0x2b0 __cpu_down_maps_locked+0x10/0x20 work_for_cpu_fn+0x15/0x20 process_scheduled_works+0x2a7/0x500 worker_thread+0x173/0x330 ? __pfx_worker_thread+0x10/0x10 kthread+0xe6/0x120 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2f/0x40 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1b/0x30 Signed-off-by: Frederic Weisbecker Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 46 +++++++++++++++++++++++++++++++++++++++------- kernel/workqueue.c | 20 ++++++++++++-------- 2 files changed, 51 insertions(+), 15 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 1c1d06804d45..24b1e5070f4d 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -274,18 +274,16 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } * to generate better code. */ #ifdef CONFIG_LOCKDEP -#define __INIT_WORK(_work, _func, _onstack) \ +#define __INIT_WORK_KEY(_work, _func, _onstack, _key) \ do { \ - static struct lock_class_key __key; \ - \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ - lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, &__key, 0); \ + lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, (_key), 0); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) #else -#define __INIT_WORK(_work, _func, _onstack) \ +#define __INIT_WORK_KEY(_work, _func, _onstack, _key) \ do { \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ @@ -294,12 +292,22 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; } } while (0) #endif +#define __INIT_WORK(_work, _func, _onstack) \ + do { \ + static __maybe_unused struct lock_class_key __key; \ + \ + __INIT_WORK_KEY(_work, _func, _onstack, &__key); \ + } while (0) + #define INIT_WORK(_work, _func) \ __INIT_WORK((_work), (_func), 0) #define INIT_WORK_ONSTACK(_work, _func) \ __INIT_WORK((_work), (_func), 1) +#define INIT_WORK_ONSTACK_KEY(_work, _func, _key) \ + __INIT_WORK_KEY((_work), (_func), 1, _key) + #define __INIT_DELAYED_WORK(_work, _func, _tflags) \ do { \ INIT_WORK(&(_work)->work, (_func)); \ @@ -693,8 +701,32 @@ static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) return fn(arg); } #else -long work_on_cpu(int cpu, long (*fn)(void *), void *arg); -long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg); +long work_on_cpu_key(int cpu, long (*fn)(void *), + void *arg, struct lock_class_key *key); +/* + * A new key is defined for each caller to make sure the work + * associated with the function doesn't share its locking class. + */ +#define work_on_cpu(_cpu, _fn, _arg) \ +({ \ + static struct lock_class_key __key; \ + \ + work_on_cpu_key(_cpu, _fn, _arg, &__key); \ +}) + +long work_on_cpu_safe_key(int cpu, long (*fn)(void *), + void *arg, struct lock_class_key *key); + +/* + * A new key is defined for each caller to make sure the work + * associated with the function doesn't share its locking class. + */ +#define work_on_cpu_safe(_cpu, _fn, _arg) \ +({ \ + static struct lock_class_key __key; \ + \ + work_on_cpu_safe_key(_cpu, _fn, _arg, &__key); \ +}) #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a3522b70218d..0f682da96e1c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5622,50 +5622,54 @@ static void work_for_cpu_fn(struct work_struct *work) } /** - * work_on_cpu - run a function in thread context on a particular cpu + * work_on_cpu_key - run a function in thread context on a particular cpu * @cpu: the cpu to run on * @fn: the function to run * @arg: the function arg + * @key: The lock class key for lock debugging purposes * * It is up to the caller to ensure that the cpu doesn't go offline. * The caller must not hold any locks which would prevent @fn from completing. * * Return: The value @fn returns. */ -long work_on_cpu(int cpu, long (*fn)(void *), void *arg) +long work_on_cpu_key(int cpu, long (*fn)(void *), + void *arg, struct lock_class_key *key) { struct work_for_cpu wfc = { .fn = fn, .arg = arg }; - INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); + INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key); schedule_work_on(cpu, &wfc.work); flush_work(&wfc.work); destroy_work_on_stack(&wfc.work); return wfc.ret; } -EXPORT_SYMBOL_GPL(work_on_cpu); +EXPORT_SYMBOL_GPL(work_on_cpu_key); /** - * work_on_cpu_safe - run a function in thread context on a particular cpu + * work_on_cpu_safe_key - run a function in thread context on a particular cpu * @cpu: the cpu to run on * @fn: the function to run * @arg: the function argument + * @key: The lock class key for lock debugging purposes * * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold * any locks which would prevent @fn from completing. * * Return: The value @fn returns. */ -long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) +long work_on_cpu_safe_key(int cpu, long (*fn)(void *), + void *arg, struct lock_class_key *key) { long ret = -ENODEV; cpus_read_lock(); if (cpu_online(cpu)) - ret = work_on_cpu(cpu, fn, arg); + ret = work_on_cpu_key(cpu, fn, arg, key); cpus_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(work_on_cpu_safe); +EXPORT_SYMBOL_GPL(work_on_cpu_safe_key); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER -- cgit From 68279f9c9f592e75d30a9ba5154a15e0a0b42ae8 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 11 Oct 2023 19:55:00 +0300 Subject: treewide: mark stuff as __ro_after_init __read_mostly predates __ro_after_init. Many variables which are marked __read_mostly should have been __ro_after_init from day 1. Also, mark some stuff as "const" and "__init" while I'm at it. [akpm@linux-foundation.org: revert sysctl_nr_open_min, sysctl_nr_open_max changes due to arm warning] [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/4f6bb9c0-abba-4ee4-a7aa-89265e886817@p183 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton --- block/bdev.c | 6 +++--- fs/anon_inodes.c | 4 ++-- fs/buffer.c | 4 ++-- fs/char_dev.c | 2 +- fs/dcache.c | 8 ++++---- fs/direct-io.c | 2 +- fs/eventpoll.c | 6 +++--- fs/fcntl.c | 2 +- fs/file_table.c | 2 +- fs/inode.c | 8 ++++---- fs/kernfs/mount.c | 5 +++-- fs/locks.c | 4 ++-- fs/namespace.c | 16 ++++++++-------- fs/notify/dnotify/dnotify.c | 6 +++--- fs/notify/fanotify/fanotify_user.c | 8 ++++---- fs/notify/inotify/inotify_user.c | 2 +- fs/pipe.c | 2 +- fs/userfaultfd.c | 2 +- kernel/audit_tree.c | 4 ++-- kernel/sched/core.c | 2 +- kernel/user_namespace.c | 2 +- kernel/workqueue.c | 16 ++++++++-------- lib/debugobjects.c | 2 +- mm/khugepaged.c | 2 +- mm/shmem.c | 8 ++++---- security/integrity/iint.c | 2 +- 26 files changed, 64 insertions(+), 63 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/block/bdev.c b/block/bdev.c index f3b13aa1b7d4..aea9143d8908 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -292,7 +292,7 @@ EXPORT_SYMBOL(thaw_bdev); */ static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock); -static struct kmem_cache * bdev_cachep __read_mostly; +static struct kmem_cache *bdev_cachep __ro_after_init; static struct inode *bdev_alloc_inode(struct super_block *sb) { @@ -361,13 +361,13 @@ static struct file_system_type bd_type = { .kill_sb = kill_anon_super, }; -struct super_block *blockdev_superblock __read_mostly; +struct super_block *blockdev_superblock __ro_after_init; EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) { int err; - static struct vfsmount *bd_mnt; + static struct vfsmount *bd_mnt __ro_after_init; bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 24192a7667ed..d26222b7eefe 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -24,8 +24,8 @@ #include -static struct vfsmount *anon_inode_mnt __read_mostly; -static struct inode *anon_inode_inode; +static struct vfsmount *anon_inode_mnt __ro_after_init; +static struct inode *anon_inode_inode __ro_after_init; /* * anon_inodefs_dname() is called from d_path(). diff --git a/fs/buffer.c b/fs/buffer.c index 12e9a71c693d..a19fef583116 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2988,13 +2988,13 @@ EXPORT_SYMBOL(try_to_free_buffers); /* * Buffer-head allocation */ -static struct kmem_cache *bh_cachep __read_mostly; +static struct kmem_cache *bh_cachep __ro_after_init; /* * Once the number of bh's in the machine exceeds this level, we start * stripping them in writeback. */ -static unsigned long max_buffer_heads; +static unsigned long max_buffer_heads __ro_after_init; int buffer_heads_over_limit; diff --git a/fs/char_dev.c b/fs/char_dev.c index 950b6919fb87..3d52f3d3ae77 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -25,7 +25,7 @@ #include "internal.h" -static struct kobj_map *cdev_map; +static struct kobj_map *cdev_map __ro_after_init; static DEFINE_MUTEX(chrdevs_lock); diff --git a/fs/dcache.c b/fs/dcache.c index 25ac74d30bff..0650ccdaa335 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -78,7 +78,7 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); -static struct kmem_cache *dentry_cache __read_mostly; +static struct kmem_cache *dentry_cache __ro_after_init; const struct qstr empty_name = QSTR_INIT("", 0); EXPORT_SYMBOL(empty_name); @@ -96,9 +96,9 @@ EXPORT_SYMBOL(dotdot_name); * information, yet avoid using a prime hash-size or similar. */ -static unsigned int d_hash_shift __read_mostly; +static unsigned int d_hash_shift __ro_after_init; -static struct hlist_bl_head *dentry_hashtable __read_mostly; +static struct hlist_bl_head *dentry_hashtable __ro_after_init; static inline struct hlist_bl_head *d_hash(unsigned int hash) { @@ -3324,7 +3324,7 @@ static void __init dcache_init(void) } /* SLAB cache for __getname() consumers */ -struct kmem_cache *names_cachep __read_mostly; +struct kmem_cache *names_cachep __ro_after_init; EXPORT_SYMBOL(names_cachep); void __init vfs_caches_init_early(void) diff --git a/fs/direct-io.c b/fs/direct-io.c index 7bc494ee56b9..20533266ade6 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -151,7 +151,7 @@ struct dio { }; } ____cacheline_aligned_in_smp; -static struct kmem_cache *dio_cache __read_mostly; +static struct kmem_cache *dio_cache __ro_after_init; /* * How many pages are in the queue? diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1d9a71a0c4c1..2877cc01cff1 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -256,10 +256,10 @@ static u64 loop_check_gen = 0; static struct eventpoll *inserting_into; /* Slab cache used to allocate "struct epitem" */ -static struct kmem_cache *epi_cache __read_mostly; +static struct kmem_cache *epi_cache __ro_after_init; /* Slab cache used to allocate "struct eppoll_entry" */ -static struct kmem_cache *pwq_cache __read_mostly; +static struct kmem_cache *pwq_cache __ro_after_init; /* * List of files with newly added links, where we may need to limit the number @@ -271,7 +271,7 @@ struct epitems_head { }; static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR; -static struct kmem_cache *ephead_cache __read_mostly; +static struct kmem_cache *ephead_cache __ro_after_init; static inline void free_ephead(struct epitems_head *head) { diff --git a/fs/fcntl.c b/fs/fcntl.c index e871009f6c88..c80a6acad742 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -844,7 +844,7 @@ int send_sigurg(struct fown_struct *fown) } static DEFINE_SPINLOCK(fasync_lock); -static struct kmem_cache *fasync_cache __read_mostly; +static struct kmem_cache *fasync_cache __ro_after_init; static void fasync_free_rcu(struct rcu_head *head) { diff --git a/fs/file_table.c b/fs/file_table.c index ee21b3da9d08..687d33865035 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -40,7 +40,7 @@ static struct files_stat_struct files_stat = { }; /* SLAB cache for file structures */ -static struct kmem_cache *filp_cachep __read_mostly; +static struct kmem_cache *filp_cachep __ro_after_init; static struct percpu_counter nr_files __cacheline_aligned_in_smp; diff --git a/fs/inode.c b/fs/inode.c index 84bc3c76e5cc..2b1d473a3631 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -54,9 +54,9 @@ * inode_hash_lock */ -static unsigned int i_hash_mask __read_mostly; -static unsigned int i_hash_shift __read_mostly; -static struct hlist_head *inode_hashtable __read_mostly; +static unsigned int i_hash_mask __ro_after_init; +static unsigned int i_hash_shift __ro_after_init; +static struct hlist_head *inode_hashtable __ro_after_init; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); /* @@ -70,7 +70,7 @@ EXPORT_SYMBOL(empty_aops); static DEFINE_PER_CPU(unsigned long, nr_inodes); static DEFINE_PER_CPU(unsigned long, nr_unused); -static struct kmem_cache *inode_cachep __read_mostly; +static struct kmem_cache *inode_cachep __ro_after_init; static long get_nr_inodes(void) { diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index c4bf26142eec..43aea0ad95c8 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -21,8 +21,9 @@ #include "kernfs-internal.h" -struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; -struct kernfs_global_locks *kernfs_locks; +struct kmem_cache *kernfs_node_cache __ro_after_init; +struct kmem_cache *kernfs_iattrs_cache __ro_after_init; +struct kernfs_global_locks *kernfs_locks __ro_after_init; static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) { diff --git a/fs/locks.c b/fs/locks.c index 76ad05f8070a..dbd2fb1f7494 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -167,8 +167,8 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS); */ static DEFINE_SPINLOCK(blocked_lock_lock); -static struct kmem_cache *flctx_cache __read_mostly; -static struct kmem_cache *filelock_cache __read_mostly; +static struct kmem_cache *flctx_cache __ro_after_init; +static struct kmem_cache *filelock_cache __ro_after_init; static struct file_lock_context * locks_get_lock_context(struct inode *inode, int type) diff --git a/fs/namespace.c b/fs/namespace.c index e157efc54023..69df848fbc17 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -39,10 +39,10 @@ /* Maximum number of mounts in a mount namespace */ static unsigned int sysctl_mount_max __read_mostly = 100000; -static unsigned int m_hash_mask __read_mostly; -static unsigned int m_hash_shift __read_mostly; -static unsigned int mp_hash_mask __read_mostly; -static unsigned int mp_hash_shift __read_mostly; +static unsigned int m_hash_mask __ro_after_init; +static unsigned int m_hash_shift __ro_after_init; +static unsigned int mp_hash_mask __ro_after_init; +static unsigned int mp_hash_shift __ro_after_init; static __initdata unsigned long mhash_entries; static int __init set_mhash_entries(char *str) @@ -68,9 +68,9 @@ static u64 event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); -static struct hlist_head *mount_hashtable __read_mostly; -static struct hlist_head *mountpoint_hashtable __read_mostly; -static struct kmem_cache *mnt_cache __read_mostly; +static struct hlist_head *mount_hashtable __ro_after_init; +static struct hlist_head *mountpoint_hashtable __ro_after_init; +static struct kmem_cache *mnt_cache __ro_after_init; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ @@ -86,7 +86,7 @@ struct mount_kattr { }; /* /sys/fs */ -struct kobject *fs_kobj; +struct kobject *fs_kobj __ro_after_init; EXPORT_SYMBOL_GPL(fs_kobj); /* diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index ebdcc25df0f7..7914d223289a 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -39,9 +39,9 @@ static void __init dnotify_sysctl_init(void) #define dnotify_sysctl_init() do { } while (0) #endif -static struct kmem_cache *dnotify_struct_cache __read_mostly; -static struct kmem_cache *dnotify_mark_cache __read_mostly; -static struct fsnotify_group *dnotify_group __read_mostly; +static struct kmem_cache *dnotify_struct_cache __ro_after_init; +static struct kmem_cache *dnotify_mark_cache __ro_after_init; +static struct fsnotify_group *dnotify_group __ro_after_init; /* * dnotify will attach one of these to each inode (i_fsnotify_marks) which diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index f69c451018e3..614b435c4a8c 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -112,10 +112,10 @@ static void __init fanotify_sysctls_init(void) extern const struct fsnotify_ops fanotify_fsnotify_ops; -struct kmem_cache *fanotify_mark_cache __read_mostly; -struct kmem_cache *fanotify_fid_event_cachep __read_mostly; -struct kmem_cache *fanotify_path_event_cachep __read_mostly; -struct kmem_cache *fanotify_perm_event_cachep __read_mostly; +struct kmem_cache *fanotify_mark_cache __ro_after_init; +struct kmem_cache *fanotify_fid_event_cachep __ro_after_init; +struct kmem_cache *fanotify_path_event_cachep __ro_after_init; +struct kmem_cache *fanotify_perm_event_cachep __ro_after_init; #define FANOTIFY_EVENT_ALIGN 4 #define FANOTIFY_FID_INFO_HDR_LEN \ diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 1c4bfdab008d..a3809ae92170 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -49,7 +49,7 @@ /* configurable via /proc/sys/fs/inotify/ */ static int inotify_max_queued_events __read_mostly; -struct kmem_cache *inotify_inode_mark_cachep __read_mostly; +struct kmem_cache *inotify_inode_mark_cachep __ro_after_init; #ifdef CONFIG_SYSCTL diff --git a/fs/pipe.c b/fs/pipe.c index 139190165a1c..6b279abf0129 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -854,7 +854,7 @@ void free_pipe_info(struct pipe_inode_info *pipe) kfree(pipe); } -static struct vfsmount *pipe_mnt __read_mostly; +static struct vfsmount *pipe_mnt __ro_after_init; /* * pipefs_dname() is called from d_path(). diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 56eaae9dac1a..ed09d70027a0 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -49,7 +49,7 @@ static struct ctl_table vm_userfaultfd_table[] = { }; #endif -static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; +static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init; /* * Start with fault_pending_wqh and fault_wqh so they're more likely diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index e867c17d3f84..b21b9652c1a8 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -87,8 +87,8 @@ static struct task_struct *prune_thread; * that makes a difference. Some. */ -static struct fsnotify_group *audit_tree_group; -static struct kmem_cache *audit_tree_mark_cachep __read_mostly; +static struct fsnotify_group *audit_tree_group __ro_after_init; +static struct kmem_cache *audit_tree_mark_cachep __ro_after_init; static struct audit_tree *alloc_tree(const char *s) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 802551e0009b..7d4cf741e087 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9903,7 +9903,7 @@ struct task_group root_task_group; LIST_HEAD(task_groups); /* Cacheline aligned slab cache for task_group */ -static struct kmem_cache *task_group_cache __read_mostly; +static struct kmem_cache *task_group_cache __ro_after_init; #endif void __init sched_init(void) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 1d8e47bed3f1..bf2cb8c11571 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -22,7 +22,7 @@ #include #include -static struct kmem_cache *user_ns_cachep __read_mostly; +static struct kmem_cache *user_ns_cachep __ro_after_init; static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b9f053a5a5f0..96b89f0edbe3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -418,21 +418,21 @@ static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; * process context while holding a pool lock. Bounce to a dedicated kthread * worker to avoid A-A deadlocks. */ -static struct kthread_worker *pwq_release_worker; +static struct kthread_worker *pwq_release_worker __ro_after_init; -struct workqueue_struct *system_wq __read_mostly; +struct workqueue_struct *system_wq __ro_after_init; EXPORT_SYMBOL(system_wq); -struct workqueue_struct *system_highpri_wq __read_mostly; +struct workqueue_struct *system_highpri_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_highpri_wq); -struct workqueue_struct *system_long_wq __read_mostly; +struct workqueue_struct *system_long_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_long_wq); -struct workqueue_struct *system_unbound_wq __read_mostly; +struct workqueue_struct *system_unbound_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_unbound_wq); -struct workqueue_struct *system_freezable_wq __read_mostly; +struct workqueue_struct *system_freezable_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_freezable_wq); -struct workqueue_struct *system_power_efficient_wq __read_mostly; +struct workqueue_struct *system_power_efficient_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_power_efficient_wq); -struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly; +struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); diff --git a/lib/debugobjects.c b/lib/debugobjects.c index a517256a270b..2a8e9d63fbe3 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -89,7 +89,7 @@ static int debug_objects_pool_size __read_mostly static int debug_objects_pool_min_level __read_mostly = ODEBUG_POOL_MIN_LEVEL; static const struct debug_obj_descr *descr_test __read_mostly; -static struct kmem_cache *obj_cache __read_mostly; +static struct kmem_cache *obj_cache __ro_after_init; /* * Track numbers of kmem_cache_alloc()/free() calls done. diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 88433cc25d8a..cb3f1d738810 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -91,7 +91,7 @@ static unsigned int khugepaged_max_ptes_shared __read_mostly; #define MM_SLOTS_HASH_BITS 10 static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); -static struct kmem_cache *mm_slot_cache __read_mostly; +static struct kmem_cache *mm_slot_cache __ro_after_init; struct collapse_control { bool is_khugepaged; diff --git a/mm/shmem.c b/mm/shmem.c index 69595d341882..389212972e72 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -42,7 +42,7 @@ #include #include "swap.h" -static struct vfsmount *shm_mnt; +static struct vfsmount *shm_mnt __ro_after_init; #ifdef CONFIG_SHMEM /* @@ -4394,7 +4394,7 @@ static const struct fs_context_operations shmem_fs_context_ops = { #endif }; -static struct kmem_cache *shmem_inode_cachep; +static struct kmem_cache *shmem_inode_cachep __ro_after_init; static struct inode *shmem_alloc_inode(struct super_block *sb) { @@ -4426,14 +4426,14 @@ static void shmem_init_inode(void *foo) inode_init_once(&info->vfs_inode); } -static void shmem_init_inodecache(void) +static void __init shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); } -static void shmem_destroy_inodecache(void) +static void __init shmem_destroy_inodecache(void) { kmem_cache_destroy(shmem_inode_cachep); } diff --git a/security/integrity/iint.c b/security/integrity/iint.c index a462df827de2..5b1c2de6cc54 100644 --- a/security/integrity/iint.c +++ b/security/integrity/iint.c @@ -23,7 +23,7 @@ static struct rb_root integrity_iint_tree = RB_ROOT; static DEFINE_RWLOCK(integrity_iint_lock); -static struct kmem_cache *iint_cache __read_mostly; +static struct kmem_cache *iint_cache __ro_after_init; struct dentry *integrity_dir; -- cgit From fe28f631fa941fba583d1c4f25895284b90af671 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 25 Oct 2023 14:25:52 -0400 Subject: workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs from wq_unbound_cpumask When the "isolcpus" boot command line option is used to add a set of isolated CPUs, those CPUs will be excluded automatically from wq_unbound_cpumask to avoid running work functions from unbound workqueues. Recently cpuset has been extended to allow the creation of partitions of isolated CPUs dynamically. To make it closer to the "isolcpus" in functionality, the CPUs in those isolated cpuset partitions should be excluded from wq_unbound_cpumask as well. This can be done currently by explicitly writing to the workqueue's cpumask sysfs file after creating the isolated partitions. However, this process can be error prone. Ideally, the cpuset code should be allowed to request the workqueue code to exclude those isolated CPUs from wq_unbound_cpumask so that this operation can be done automatically and the isolated CPUs will be returned back to wq_unbound_cpumask after the destructions of the isolated cpuset partitions. This patch adds a new workqueue_unbound_exclude_cpumask() function to enable that. This new function will exclude the specified isolated CPUs from wq_unbound_cpumask. To be able to restore those isolated CPUs back after the destruction of isolated cpuset partitions, a new wq_requested_unbound_cpumask is added to store the user provided unbound cpumask either from the boot command line options or from writing to the cpumask sysfs file. This new cpumask provides the basis for CPU exclusion. To enable users to understand how the wq_unbound_cpumask is being modified internally, this patch also exposes the newly introduced wq_requested_unbound_cpumask as well as a wq_isolated_cpumask to store the cpumask to be excluded from wq_unbound_cpumask as read-only sysfs files. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- kernel/workqueue.c | 91 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 84 insertions(+), 9 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 24b1e5070f4d..b0b9604b76b8 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -491,7 +491,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(void); void free_workqueue_attrs(struct workqueue_attrs *attrs); int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs); -int workqueue_set_unbound_cpumask(cpumask_var_t cpumask); +extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6e578f576a6f..bd9d34eacd78 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -381,6 +381,12 @@ static bool workqueue_freezing; /* PL: have wqs started freezing? */ /* PL&A: allowable cpus for unbound wqs and work items */ static cpumask_var_t wq_unbound_cpumask; +/* PL: user requested unbound cpumask via sysfs */ +static cpumask_var_t wq_requested_unbound_cpumask; + +/* PL: isolated cpumask to be excluded from unbound cpumask */ +static cpumask_var_t wq_isolated_cpumask; + /* for further constrain wq_unbound_cpumask by cmdline parameter*/ static struct cpumask wq_cmdline_cpumask __initdata; @@ -5839,7 +5845,7 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) * -EINVAL - Invalid @cpumask * -ENOMEM - Failed to allocate memory for attrs or pwqs. */ -int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) +static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) { int ret = -EINVAL; @@ -5850,6 +5856,7 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); + cpumask_copy(wq_requested_unbound_cpumask, cpumask); if (cpumask_equal(cpumask, wq_unbound_cpumask)) { ret = 0; goto out_unlock; @@ -5864,6 +5871,44 @@ out_unlock: return ret; } +/** + * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask + * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask + * + * This function can be called from cpuset code to provide a set of isolated + * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold + * either cpus_read_lock or cpus_write_lock. + */ +int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask) +{ + cpumask_var_t cpumask; + int ret = 0; + + if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + lockdep_assert_cpus_held(); + mutex_lock(&wq_pool_mutex); + + /* Save the current isolated cpumask & export it via sysfs */ + cpumask_copy(wq_isolated_cpumask, exclude_cpumask); + + /* + * If the operation fails, it will fall back to + * wq_requested_unbound_cpumask which is initially set to + * (HK_TYPE_WQ ∩ HK_TYPE_DOMAIN) house keeping mask and rewritten + * by any subsequent write to workqueue/cpumask sysfs file. + */ + if (!cpumask_andnot(cpumask, wq_requested_unbound_cpumask, exclude_cpumask)) + cpumask_copy(cpumask, wq_requested_unbound_cpumask); + if (!cpumask_equal(cpumask, wq_unbound_cpumask)) + ret = workqueue_apply_unbound_cpumask(cpumask); + + mutex_unlock(&wq_pool_mutex); + free_cpumask_var(cpumask); + return ret; +} + static int parse_affn_scope(const char *val) { int i; @@ -6158,19 +6203,36 @@ static struct bus_type wq_subsys = { .dev_groups = wq_sysfs_groups, }; -static ssize_t wq_unbound_cpumask_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t __wq_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf, cpumask_var_t mask) { int written; mutex_lock(&wq_pool_mutex); - written = scnprintf(buf, PAGE_SIZE, "%*pb\n", - cpumask_pr_args(wq_unbound_cpumask)); + written = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); mutex_unlock(&wq_pool_mutex); return written; } +static ssize_t wq_unbound_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask); +} + +static ssize_t wq_requested_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask); +} + +static ssize_t wq_isolated_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask); +} + static ssize_t wq_unbound_cpumask_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { @@ -6188,9 +6250,13 @@ static ssize_t wq_unbound_cpumask_store(struct device *dev, return ret ? ret : count; } -static struct device_attribute wq_sysfs_cpumask_attr = +static struct device_attribute wq_sysfs_cpumask_attrs[] = { __ATTR(cpumask, 0644, wq_unbound_cpumask_show, - wq_unbound_cpumask_store); + wq_unbound_cpumask_store), + __ATTR(cpumask_requested, 0444, wq_requested_cpumask_show, NULL), + __ATTR(cpumask_isolated, 0444, wq_isolated_cpumask_show, NULL), + __ATTR_NULL, +}; static int __init wq_sysfs_init(void) { @@ -6203,7 +6269,13 @@ static int __init wq_sysfs_init(void) dev_root = bus_get_dev_root(&wq_subsys); if (dev_root) { - err = device_create_file(dev_root, &wq_sysfs_cpumask_attr); + struct device_attribute *attr; + + for (attr = wq_sysfs_cpumask_attrs; attr->attr.name; attr++) { + err = device_create_file(dev_root, attr); + if (err) + break; + } put_device(dev_root); } return err; @@ -6534,11 +6606,14 @@ void __init workqueue_init_early(void) BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL)); + BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL)); cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ)); cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (!cpumask_empty(&wq_cmdline_cpumask)) cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask); + cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); -- cgit From 49277a5b76373e630075ff7d32fc0f9f51294f24 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 Nov 2023 21:18:40 -0500 Subject: workqueue: Move workqueue_set_unbound_cpumask() and its helpers inside CONFIG_SYSFS Commit fe28f631fa94 ("workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs from wq_unbound_cpumask") makes workqueue_set_unbound_cpumask() static as it is not used elsewhere in the kernel. However, this triggers a kernel test robot warning about 'workqueue_set_unbound_cpumask' defined but not used when CONFIG_SYS isn't defined. It happens that workqueue_set_unbound_cpumask() is only called when CONFIG_SYS is defined. Move workqueue_set_unbound_cpumask() and its helpers inside the CONFIG_SYSFS compilation block to avoid the warning. There is no functional change. Fixes: fe28f631fa94 ("workqueue: Add workqueue_unbound_exclude_cpumask() to exclude CPUs from wq_unbound_cpumask") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202311130831.uh0AoCd1-lkp@intel.com/ Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/workqueue.c | 102 ++++++++++++++++++++++++++--------------------------- 1 file changed, 51 insertions(+), 51 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bd9d34eacd78..2fc585d3d6ca 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4417,19 +4417,6 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) mutex_unlock(&ctx->wq->mutex); } -static void apply_wqattrs_lock(void) -{ - /* CPUs should stay stable across pwq creations and installations */ - cpus_read_lock(); - mutex_lock(&wq_pool_mutex); -} - -static void apply_wqattrs_unlock(void) -{ - mutex_unlock(&wq_pool_mutex); - cpus_read_unlock(); -} - static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { @@ -5833,44 +5820,6 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask) return ret; } -/** - * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask - * @cpumask: the cpumask to set - * - * The low-level workqueues cpumask is a global cpumask that limits - * the affinity of all unbound workqueues. This function check the @cpumask - * and apply it to all unbound workqueues and updates all pwqs of them. - * - * Return: 0 - Success - * -EINVAL - Invalid @cpumask - * -ENOMEM - Failed to allocate memory for attrs or pwqs. - */ -static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) -{ - int ret = -EINVAL; - - /* - * Not excluding isolated cpus on purpose. - * If the user wishes to include them, we allow that. - */ - cpumask_and(cpumask, cpumask, cpu_possible_mask); - if (!cpumask_empty(cpumask)) { - apply_wqattrs_lock(); - cpumask_copy(wq_requested_unbound_cpumask, cpumask); - if (cpumask_equal(cpumask, wq_unbound_cpumask)) { - ret = 0; - goto out_unlock; - } - - ret = workqueue_apply_unbound_cpumask(cpumask); - -out_unlock: - apply_wqattrs_unlock(); - } - - return ret; -} - /** * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask @@ -6027,6 +5976,19 @@ static struct attribute *wq_sysfs_attrs[] = { }; ATTRIBUTE_GROUPS(wq_sysfs); +static void apply_wqattrs_lock(void) +{ + /* CPUs should stay stable across pwq creations and installations */ + cpus_read_lock(); + mutex_lock(&wq_pool_mutex); +} + +static void apply_wqattrs_unlock(void) +{ + mutex_unlock(&wq_pool_mutex); + cpus_read_unlock(); +} + static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -6203,6 +6165,44 @@ static struct bus_type wq_subsys = { .dev_groups = wq_sysfs_groups, }; +/** + * workqueue_set_unbound_cpumask - Set the low-level unbound cpumask + * @cpumask: the cpumask to set + * + * The low-level workqueues cpumask is a global cpumask that limits + * the affinity of all unbound workqueues. This function check the @cpumask + * and apply it to all unbound workqueues and updates all pwqs of them. + * + * Return: 0 - Success + * -EINVAL - Invalid @cpumask + * -ENOMEM - Failed to allocate memory for attrs or pwqs. + */ +static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) +{ + int ret = -EINVAL; + + /* + * Not excluding isolated cpus on purpose. + * If the user wishes to include them, we allow that. + */ + cpumask_and(cpumask, cpumask, cpu_possible_mask); + if (!cpumask_empty(cpumask)) { + apply_wqattrs_lock(); + cpumask_copy(wq_requested_unbound_cpumask, cpumask); + if (cpumask_equal(cpumask, wq_unbound_cpumask)) { + ret = 0; + goto out_unlock; + } + + ret = workqueue_apply_unbound_cpumask(cpumask); + +out_unlock: + apply_wqattrs_unlock(); + } + + return ret; +} + static ssize_t __wq_cpumask_show(struct device *dev, struct device_attribute *attr, char *buf, cpumask_var_t mask) { -- cgit From 4a6c5607d4502ccd1b15b57d57f17d12b6f257a7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 21 Nov 2023 11:39:36 -1000 Subject: workqueue: Make sure that wq_unbound_cpumask is never empty During boot, depending on how the housekeeping and workqueue.unbound_cpus masks are set, wq_unbound_cpumask can end up empty. Since 8639ecebc9b1 ("workqueue: Implement non-strict affinity scope for unbound workqueues"), this may end up feeding -1 as a CPU number into scheduler leading to oopses. BUG: unable to handle page fault for address: ffffffff8305e9c0 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page ... Call Trace: select_idle_sibling+0x79/0xaf0 select_task_rq_fair+0x1cb/0x7b0 try_to_wake_up+0x29c/0x5c0 wake_up_process+0x19/0x20 kick_pool+0x5e/0xb0 __queue_work+0x119/0x430 queue_work_on+0x29/0x30 ... An empty wq_unbound_cpumask is a clear misconfiguration and already disallowed once system is booted up. Let's warn on and ignore unbound_cpumask restrictions which lead to no unbound cpus. While at it, also remove now unncessary empty check on wq_unbound_cpumask in wq_select_unbound_cpu(). Signed-off-by: Tejun Heo Reported-and-Tested-by: Yong He Link: http://lkml.kernel.org/r/20231120121623.119780-1-alexyonghe@tencent.com Fixes: 8639ecebc9b1 ("workqueue: Implement non-strict affinity scope for unbound workqueues") Cc: stable@vger.kernel.org # v6.6+ Reviewed-by: Waiman Long --- kernel/workqueue.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6e578f576a6f..2989b57e154a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1684,9 +1684,6 @@ static int wq_select_unbound_cpu(int cpu) pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n"); } - if (cpumask_empty(wq_unbound_cpumask)) - return cpu; - new_cpu = __this_cpu_read(wq_rr_cpu_last); new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask); if (unlikely(new_cpu >= nr_cpu_ids)) { @@ -6515,6 +6512,17 @@ static inline void wq_watchdog_init(void) { } #endif /* CONFIG_WQ_WATCHDOG */ +static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask) +{ + if (!cpumask_intersects(wq_unbound_cpumask, mask)) { + pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n", + cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask)); + return; + } + + cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask); +} + /** * workqueue_init_early - early init for workqueue subsystem * @@ -6534,11 +6542,11 @@ void __init workqueue_init_early(void) BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ)); - cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); - + cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); + restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ)); + restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN)); if (!cpumask_empty(&wq_cmdline_cpumask)) - cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask); + restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); -- cgit