From 0cce06ba859a515bd06224085d3addb870608b6d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Apr 2023 17:03:13 +0200 Subject: debugobjects,locking: Annotate debug_object_fill_pool() wait type violation There is an explicit wait-type violation in debug_object_fill_pool() for PREEMPT_RT=n kernels which allows them to more easily fill the object pool and reduce the chance of allocation failures. Lockdep's wait-type checks are designed to check the PREEMPT_RT locking rules even for PREEMPT_RT=n kernels and object to this, so create a lockdep annotation to allow this to stand. Specifically, create a 'lock' type that overrides the inner wait-type while it is held -- allowing one to temporarily raise it, such that the violation is hidden. Reported-by: Vlastimil Babka Reported-by: Qi Zheng Signed-off-by: Peter Zijlstra (Intel) Tested-by: Qi Zheng Link: https://lkml.kernel.org/r/20230429100614.GA1489784@hirez.programming.kicks-ass.net --- include/linux/lockdep.h | 14 ++++++++++++++ include/linux/lockdep_types.h | 1 + 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 1023f349af71..a3329fb49b33 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -339,6 +339,16 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) #define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) +/* + * Must use lock_map_aquire_try() with override maps to avoid + * lockdep thinking they participate in the block chain. + */ +#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ + struct lockdep_map _name = { \ + .name = #_name "-wait-type-override", \ + .wait_type_inner = _wait_type, \ + .lock_type = LD_LOCK_WAIT_OVERRIDE, } + #else /* !CONFIG_LOCKDEP */ static inline void lockdep_init_task(struct task_struct *task) @@ -427,6 +437,9 @@ extern int lockdep_is_held(const void *); #define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) #define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) +#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ + struct lockdep_map __maybe_unused _name = {} + #endif /* !LOCKDEP */ enum xhlock_context_t { @@ -551,6 +564,7 @@ do { \ #define rwsem_release(l, i) lock_release(l, i) #define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_) +#define lock_map_acquire_try(l) lock_acquire_exclusive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_tryread(l) lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_release(l) lock_release(l, _THIS_IP_) diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h index d22430840b53..59f4fb1626ea 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h @@ -33,6 +33,7 @@ enum lockdep_wait_type { enum lockdep_lock_type { LD_LOCK_NORMAL = 0, /* normal, catch all */ LD_LOCK_PERCPU, /* percpu */ + LD_LOCK_WAIT_OVERRIDE, /* annotation */ LD_LOCK_MAX, }; -- cgit From 23a5b8bb022c1e071ca91b1a9c10f0ad6a0966e9 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 27 Apr 2023 00:33:36 -0500 Subject: x86/amd_nb: Add PCI ID for family 19h model 78h Commit 310e782a99c7 ("platform/x86/amd: pmc: Utilize SMN index 0 for driver probe") switched to using amd_smn_read() which relies upon the misc PCI ID used by DF function 3 being included in a table. The ID for model 78h is missing in that table, so amd_smn_read() doesn't work. Add the missing ID into amd_nb, restoring s2idle on this system. [ bp: Simplify commit message. ] Fixes: 310e782a99c7 ("platform/x86/amd: pmc: Utilize SMN index 0 for driver probe") Signed-off-by: Mario Limonciello Signed-off-by: Borislav Petkov (AMD) Acked-by: Bjorn Helgaas # pci_ids.h Acked-by: Guenter Roeck Link: https://lore.kernel.org/r/20230427053338.16653-2-mario.limonciello@amd.com --- arch/x86/kernel/amd_nb.c | 2 ++ include/linux/pci_ids.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 4266b64631a4..7e331e8f3692 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -36,6 +36,7 @@ #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e #define PCI_DEVICE_ID_AMD_19H_M60H_DF_F4 0x14e4 #define PCI_DEVICE_ID_AMD_19H_M70H_DF_F4 0x14f4 +#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F4 0x12fc /* Protect the PCI config register pairs used for SMN. */ static DEFINE_MUTEX(smn_mutex); @@ -79,6 +80,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M60H_DF_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M70H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M78H_DF_F3) }, {} }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 45c3d62e616d..95f33dadb2be 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -567,6 +567,7 @@ #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F3 0x166d #define PCI_DEVICE_ID_AMD_19H_M60H_DF_F3 0x14e3 #define PCI_DEVICE_ID_AMD_19H_M70H_DF_F3 0x14f3 +#define PCI_DEVICE_ID_AMD_19H_M78H_DF_F3 0x12fb #define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703 #define PCI_DEVICE_ID_AMD_LANCE 0x2000 #define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001 -- cgit From c00bc80462afc7963f449d7f21d896d2f629cacc Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 15 Apr 2023 20:23:34 +0200 Subject: power: supply: bq27xxx: Fix poll_interval handling and races on remove Before this patch bq27xxx_battery_teardown() was setting poll_interval = 0 to avoid bq27xxx_battery_update() requeuing the delayed_work item. There are 2 problems with this: 1. If the driver is unbound through sysfs, rather then the module being rmmod-ed, this changes poll_interval unexpectedly 2. This is racy, after it being set poll_interval could be changed before bq27xxx_battery_update() checks it through /sys/module/bq27xxx_battery/parameters/poll_interval Fix this by added a removed attribute to struct bq27xxx_device_info and using that instead of setting poll_interval to 0. There also is another poll_interval related race on remove(), writing /sys/module/bq27xxx_battery/parameters/poll_interval will requeue the delayed_work item for all devices on the bq27xxx_battery_devices list and the device being removed was only removed from that list after cancelling the delayed_work item. Fix this by moving the removal from the bq27xxx_battery_devices list to before cancelling the delayed_work item. Fixes: 8cfaaa811894 ("bq27x00_battery: Fix OOPS caused by unregistring bq27x00 driver") Signed-off-by: Hans de Goede Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq27xxx_battery.c | 22 +++++++++------------- include/linux/power/bq27xxx_battery.h | 1 + 2 files changed, 10 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c index 79aedf83cc8d..7411f1cf741b 100644 --- a/drivers/power/supply/bq27xxx_battery.c +++ b/drivers/power/supply/bq27xxx_battery.c @@ -1801,7 +1801,7 @@ static void bq27xxx_battery_update_unlocked(struct bq27xxx_device_info *di) di->last_update = jiffies; - if (poll_interval > 0) + if (!di->removed && poll_interval > 0) mod_delayed_work(system_wq, &di->work, poll_interval * HZ); } @@ -2132,22 +2132,18 @@ EXPORT_SYMBOL_GPL(bq27xxx_battery_setup); void bq27xxx_battery_teardown(struct bq27xxx_device_info *di) { - /* - * power_supply_unregister call bq27xxx_battery_get_property which - * call bq27xxx_battery_poll. - * Make sure that bq27xxx_battery_poll will not call - * schedule_delayed_work again after unregister (which cause OOPS). - */ - poll_interval = 0; - - cancel_delayed_work_sync(&di->work); - - power_supply_unregister(di->bat); - mutex_lock(&bq27xxx_list_lock); list_del(&di->list); mutex_unlock(&bq27xxx_list_lock); + /* Set removed to avoid bq27xxx_battery_update() re-queuing the work */ + mutex_lock(&di->lock); + di->removed = true; + mutex_unlock(&di->lock); + + cancel_delayed_work_sync(&di->work); + + power_supply_unregister(di->bat); mutex_destroy(&di->lock); } EXPORT_SYMBOL_GPL(bq27xxx_battery_teardown); diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h index a1aa68141d0b..e3322dad9c85 100644 --- a/include/linux/power/bq27xxx_battery.h +++ b/include/linux/power/bq27xxx_battery.h @@ -68,6 +68,7 @@ struct bq27xxx_device_info { struct bq27xxx_access_methods bus; struct bq27xxx_reg_cache cache; int charge_design_full; + bool removed; unsigned long last_update; struct delayed_work work; struct power_supply *bat; -- cgit From 939a116142012926e25de0ea6b7e2f8d86a5f1b6 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sat, 15 Apr 2023 20:23:37 +0200 Subject: power: supply: bq27xxx: Ensure power_supply_changed() is called on current sign changes On gauges where the current register is signed, there is no charging flag in the flags register. So only checking flags will not result in power_supply_changed() getting called when e.g. a charger is plugged in and the current sign changes from negative (discharging) to positive (charging). This causes userspace's notion of the status to lag until userspace does a poll. And when a power_supply_leds.c LED trigger is used to indicate charging status with a LED, this LED will lag until the capacity percentage changes, which may take many minutes (because the LED trigger only is updated on power_supply_changed() calls). Fix this by calling bq27xxx_battery_current_and_status() on gauges with a signed current register and checking if the status has changed. Fixes: 297a533b3e62 ("bq27x00: Cache battery registers") Signed-off-by: Hans de Goede Signed-off-by: Sebastian Reichel --- drivers/power/supply/bq27xxx_battery.c | 13 ++++++++++++- include/linux/power/bq27xxx_battery.h | 3 +++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/power/supply/bq27xxx_battery.c b/drivers/power/supply/bq27xxx_battery.c index 8f2995e9850a..f98b51ce19b5 100644 --- a/drivers/power/supply/bq27xxx_battery.c +++ b/drivers/power/supply/bq27xxx_battery.c @@ -1836,6 +1836,7 @@ static int bq27xxx_battery_current_and_status( static void bq27xxx_battery_update_unlocked(struct bq27xxx_device_info *di) { + union power_supply_propval status = di->last_status; struct bq27xxx_reg_cache cache = {0, }; bool has_singe_flag = di->opts & BQ27XXX_O_ZERO; @@ -1860,14 +1861,24 @@ static void bq27xxx_battery_update_unlocked(struct bq27xxx_device_info *di) if (di->regs[BQ27XXX_REG_CYCT] != INVALID_REG_ADDR) cache.cycle_count = bq27xxx_battery_read_cyct(di); + /* + * On gauges with signed current reporting the current must be + * checked to detect charging <-> discharging status changes. + */ + if (!(di->opts & BQ27XXX_O_ZERO)) + bq27xxx_battery_current_and_status(di, NULL, &status, &cache); + /* We only have to read charge design full once */ if (di->charge_design_full <= 0) di->charge_design_full = bq27xxx_battery_read_dcap(di); } if ((di->cache.capacity != cache.capacity) || - (di->cache.flags != cache.flags)) + (di->cache.flags != cache.flags) || + (di->last_status.intval != status.intval)) { + di->last_status.intval = status.intval; power_supply_changed(di->bat); + } if (memcmp(&di->cache, &cache, sizeof(cache)) != 0) di->cache = cache; diff --git a/include/linux/power/bq27xxx_battery.h b/include/linux/power/bq27xxx_battery.h index e3322dad9c85..7c8d65414a70 100644 --- a/include/linux/power/bq27xxx_battery.h +++ b/include/linux/power/bq27xxx_battery.h @@ -2,6 +2,8 @@ #ifndef __LINUX_BQ27X00_BATTERY_H__ #define __LINUX_BQ27X00_BATTERY_H__ +#include + enum bq27xxx_chip { BQ27000 = 1, /* bq27000, bq27200 */ BQ27010, /* bq27010, bq27210 */ @@ -70,6 +72,7 @@ struct bq27xxx_device_info { int charge_design_full; bool removed; unsigned long last_update; + union power_supply_propval last_status; struct delayed_work work; struct power_supply *bat; struct list_head list; -- cgit From 19b8766459c41c6f318f8a548cc1c66dffd18363 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Thu, 20 Apr 2023 16:06:03 +0100 Subject: firmware: arm_ffa: Fix FFA device names for logical partitions Each physical partition can provide multiple services each with UUID. Each such service can be presented as logical partition with a unique combination of VM ID and UUID. The number of distinct UUID in a system will be less than or equal to the number of logical partitions. However, currently it fails to register more than one logical partition or service within a physical partition as the device name contains only VM ID while both VM ID and UUID are maintained in the partition information. The kernel complains with the below message: | sysfs: cannot create duplicate filename '/devices/arm-ffa-8001' | CPU: 1 PID: 1 Comm: swapper/0 Not tainted 6.3.0-rc7 #8 | Hardware name: FVP Base RevC (DT) | Call trace: | dump_backtrace+0xf8/0x118 | show_stack+0x18/0x24 | dump_stack_lvl+0x50/0x68 | dump_stack+0x18/0x24 | sysfs_create_dir_ns+0xe0/0x13c | kobject_add_internal+0x220/0x3d4 | kobject_add+0x94/0x100 | device_add+0x144/0x5d8 | device_register+0x20/0x30 | ffa_device_register+0x88/0xd8 | ffa_setup_partitions+0x108/0x1b8 | ffa_init+0x2ec/0x3a4 | do_one_initcall+0xcc/0x240 | do_initcall_level+0x8c/0xac | do_initcalls+0x54/0x94 | do_basic_setup+0x1c/0x28 | kernel_init_freeable+0x100/0x16c | kernel_init+0x20/0x1a0 | ret_from_fork+0x10/0x20 | kobject_add_internal failed for arm-ffa-8001 with -EEXIST, don't try to | register things with the same name in the same directory. | arm_ffa arm-ffa: unable to register device arm-ffa-8001 err=-17 | ARM FF-A: ffa_setup_partitions: failed to register partition ID 0x8001 By virtue of being random enough to avoid collisions when generated in a distributed system, there is no way to compress UUID keys to the number of bits required to identify each. We can eliminate '-' in the name but it is not worth eliminating 4 bytes and add unnecessary logic for doing that. Also v1.0 doesn't provide the UUID of the partitions which makes it hard to use the same for the device name. So to keep it simple, let us alloc an ID using ida_alloc() and append the same to "arm-ffa" to make up a unique device name. Also stash the id value in ffa_dev to help freeing the ID later when the device is destroyed. Fixes: e781858488b9 ("firmware: arm_ffa: Add initial FFA bus support for device enumeration") Reported-by: Lucian Paul-Trifu Link: https://lore.kernel.org/r/20230419-ffa_fixes_6-4-v2-3-d9108e43a176@arm.com Signed-off-by: Sudeep Holla --- drivers/firmware/arm_ffa/bus.c | 16 +++++++++++++--- include/linux/arm_ffa.h | 1 + 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/arm_ffa/bus.c b/drivers/firmware/arm_ffa/bus.c index 36bd5423c2f0..2b8bfcd010f5 100644 --- a/drivers/firmware/arm_ffa/bus.c +++ b/drivers/firmware/arm_ffa/bus.c @@ -15,6 +15,8 @@ #include "common.h" +static DEFINE_IDA(ffa_bus_id); + static int ffa_device_match(struct device *dev, struct device_driver *drv) { const struct ffa_device_id *id_table; @@ -131,6 +133,7 @@ static void ffa_release_device(struct device *dev) { struct ffa_device *ffa_dev = to_ffa_dev(dev); + ida_free(&ffa_bus_id, ffa_dev->id); kfree(ffa_dev); } @@ -171,18 +174,24 @@ bool ffa_device_is_valid(struct ffa_device *ffa_dev) struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, const struct ffa_ops *ops) { - int ret; + int id, ret; struct device *dev; struct ffa_device *ffa_dev; + id = ida_alloc_min(&ffa_bus_id, 1, GFP_KERNEL); + if (id < 0) + return NULL; + ffa_dev = kzalloc(sizeof(*ffa_dev), GFP_KERNEL); - if (!ffa_dev) + if (!ffa_dev) { + ida_free(&ffa_bus_id, id); return NULL; + } dev = &ffa_dev->dev; dev->bus = &ffa_bus_type; dev->release = ffa_release_device; - dev_set_name(&ffa_dev->dev, "arm-ffa-%04x", vm_id); + dev_set_name(&ffa_dev->dev, "arm-ffa-%d", id); ffa_dev->vm_id = vm_id; ffa_dev->ops = ops; @@ -218,4 +227,5 @@ void arm_ffa_bus_exit(void) { ffa_devices_unregister(); bus_unregister(&ffa_bus_type); + ida_destroy(&ffa_bus_id); } diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index c87aeecaa9b2..583fe3b49a49 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -96,6 +96,7 @@ /* FFA Bus/Device/Driver related */ struct ffa_device { + u32 id; int vm_id; bool mode_32bit; uuid_t uuid; -- cgit From 162bd18eb55adf464a0fa2b4144b8d61c75ff7c2 Mon Sep 17 00:00:00 2001 From: Roy Novich Date: Sun, 7 May 2023 16:57:43 +0300 Subject: linux/dim: Do nothing if no time delta between samples Add return value for dim_calc_stats. This is an indication for the caller if curr_stats was assigned by the function. Avoid using curr_stats uninitialized over {rdma/net}_dim, when no time delta between samples. Coverity reported this potential use of an uninitialized variable. Fixes: 4c4dbb4a7363 ("net/mlx5e: Move dynamic interrupt coalescing code to include/linux") Fixes: cb3c7fd4f839 ("net/mlx5e: Support adaptive RX coalescing") Signed-off-by: Roy Novich Reviewed-by: Aya Levin Reviewed-by: Saeed Mahameed Signed-off-by: Tariq Toukan Reviewed-by: Leon Romanovsky Reviewed-by: Michal Kubiak Link: https://lore.kernel.org/r/20230507135743.138993-1-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- include/linux/dim.h | 3 ++- lib/dim/dim.c | 5 +++-- lib/dim/net_dim.c | 3 ++- lib/dim/rdma_dim.c | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dim.h b/include/linux/dim.h index 6c5733981563..f343bc9aa2ec 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -236,8 +236,9 @@ void dim_park_tired(struct dim *dim); * * Calculate the delta between two samples (in data rates). * Takes into consideration counter wrap-around. + * Returned boolean indicates whether curr_stats are reliable. */ -void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, +bool dim_calc_stats(struct dim_sample *start, struct dim_sample *end, struct dim_stats *curr_stats); /** diff --git a/lib/dim/dim.c b/lib/dim/dim.c index 38045d6d0538..e89aaf07bde5 100644 --- a/lib/dim/dim.c +++ b/lib/dim/dim.c @@ -54,7 +54,7 @@ void dim_park_tired(struct dim *dim) } EXPORT_SYMBOL(dim_park_tired); -void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, +bool dim_calc_stats(struct dim_sample *start, struct dim_sample *end, struct dim_stats *curr_stats) { /* u32 holds up to 71 minutes, should be enough */ @@ -66,7 +66,7 @@ void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, start->comp_ctr); if (!delta_us) - return; + return false; curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us); curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us); @@ -79,5 +79,6 @@ void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, else curr_stats->cpe_ratio = 0; + return true; } EXPORT_SYMBOL(dim_calc_stats); diff --git a/lib/dim/net_dim.c b/lib/dim/net_dim.c index 53f6b9c6e936..4e32f7aaac86 100644 --- a/lib/dim/net_dim.c +++ b/lib/dim/net_dim.c @@ -227,7 +227,8 @@ void net_dim(struct dim *dim, struct dim_sample end_sample) dim->start_sample.event_ctr); if (nevents < DIM_NEVENTS) break; - dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats); + if (!dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats)) + break; if (net_dim_decision(&curr_stats, dim)) { dim->state = DIM_APPLY_NEW_PROFILE; schedule_work(&dim->work); diff --git a/lib/dim/rdma_dim.c b/lib/dim/rdma_dim.c index 15462d54758d..88f779486707 100644 --- a/lib/dim/rdma_dim.c +++ b/lib/dim/rdma_dim.c @@ -88,7 +88,8 @@ void rdma_dim(struct dim *dim, u64 completions) nevents = curr_sample->event_ctr - dim->start_sample.event_ctr; if (nevents < DIM_NEVENTS) break; - dim_calc_stats(&dim->start_sample, curr_sample, &curr_stats); + if (!dim_calc_stats(&dim->start_sample, curr_sample, &curr_stats)) + break; if (rdma_dim_decision(&curr_stats, dim)) { dim->state = DIM_APPLY_NEW_PROFILE; schedule_work(&dim->work); -- cgit From 293007b033418c8c9d1b35d68dec49a500750fde Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 8 May 2023 12:13:27 -0600 Subject: io_uring: make io_uring_sqe_cmd() unconditionally available If CONFIG_IO_URING isn't set, then io_uring_sqe_cmd() is not defined. As the nvme driver uses this helper, it causes a compilation issue: drivers/nvme/host/ioctl.c: In function 'nvme_uring_cmd_io': drivers/nvme/host/ioctl.c:555:44: error: implicit declaration of function 'io_uring_sqe_cmd'; did you mean 'io_uring_free'? [-Werror=implicit-function-declaration] 555 | const struct nvme_uring_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe); | ^~~~~~~~~~~~~~~~ | io_uring_free Fix it by just making io_uring_sqe_cmd() generally available - the types are known, and there's no reason to hide it under CONFIG_IO_URING. Fixes: fd9b8547bc5c ("io_uring: Pass whole sqe to commands") Reported-by: kernel test robot Reported-by: Chen-Yu Tsai Tested-by: Chen-Yu Tsai Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 3399d979ee1c..7fe31b2cd02f 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -36,6 +36,11 @@ struct io_uring_cmd { u8 pdu[32]; /* available inline for free use */ }; +static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) +{ + return sqe->cmd; +} + #if defined(CONFIG_IO_URING) int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd); @@ -66,11 +71,6 @@ static inline void io_uring_free(struct task_struct *tsk) if (tsk->io_uring) __io_uring_free(tsk); } - -static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) -{ - return sqe->cmd; -} #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd) -- cgit From 4063384ef762cc5946fc7a3f89879e76c6ec51e2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 9 May 2023 13:18:57 +0000 Subject: net: add vlan_get_protocol_and_depth() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before blamed commit, pskb_may_pull() was used instead of skb_header_pointer() in __vlan_get_protocol() and friends. Few callers depended on skb->head being populated with MAC header, syzbot caught one of them (skb_mac_gso_segment()) Add vlan_get_protocol_and_depth() to make the intent clearer and use it where sensible. This is a more generic fix than commit e9d3f80935b6 ("net/af_packet: make sure to pull mac header") which was dealing with a similar issue. kernel BUG at include/linux/skbuff.h:2655 ! invalid opcode: 0000 [#1] SMP KASAN CPU: 0 PID: 1441 Comm: syz-executor199 Not tainted 6.1.24-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/14/2023 RIP: 0010:__skb_pull include/linux/skbuff.h:2655 [inline] RIP: 0010:skb_mac_gso_segment+0x68f/0x6a0 net/core/gro.c:136 Code: fd 48 8b 5c 24 10 44 89 6b 70 48 c7 c7 c0 ae 0d 86 44 89 e6 e8 a1 91 d0 00 48 c7 c7 00 af 0d 86 48 89 de 31 d2 e8 d1 4a e9 ff <0f> 0b 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 41 RSP: 0018:ffffc90001bd7520 EFLAGS: 00010286 RAX: ffffffff8469736a RBX: ffff88810f31dac0 RCX: ffff888115a18b00 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffc90001bd75e8 R08: ffffffff84697183 R09: fffff5200037adf9 R10: 0000000000000000 R11: dffffc0000000001 R12: 0000000000000012 R13: 000000000000fee5 R14: 0000000000005865 R15: 000000000000fed7 FS: 000055555633f300(0000) GS:ffff8881f6a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000000 CR3: 0000000116fea000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: [] __skb_gso_segment+0x32d/0x4c0 net/core/dev.c:3419 [] skb_gso_segment include/linux/netdevice.h:4819 [inline] [] validate_xmit_skb+0x3aa/0xee0 net/core/dev.c:3725 [] __dev_queue_xmit+0x1332/0x3300 net/core/dev.c:4313 [] dev_queue_xmit+0x17/0x20 include/linux/netdevice.h:3029 [] packet_snd net/packet/af_packet.c:3111 [inline] [] packet_sendmsg+0x49d2/0x6470 net/packet/af_packet.c:3142 [] sock_sendmsg_nosec net/socket.c:716 [inline] [] sock_sendmsg net/socket.c:736 [inline] [] __sys_sendto+0x472/0x5f0 net/socket.c:2139 [] __do_sys_sendto net/socket.c:2151 [inline] [] __se_sys_sendto net/socket.c:2147 [inline] [] __x64_sys_sendto+0xe5/0x100 net/socket.c:2147 [] do_syscall_x64 arch/x86/entry/common.c:50 [inline] [] do_syscall_64+0x2f/0x50 arch/x86/entry/common.c:80 [] entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 469aceddfa3e ("vlan: consolidate VLAN parsing code and limit max parsing depth") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Toke Høiland-Jørgensen Cc: Willem de Bruijn Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/tap.c | 4 ++-- include/linux/if_vlan.h | 17 +++++++++++++++++ net/bridge/br_forward.c | 2 +- net/core/dev.c | 2 +- net/packet/af_packet.c | 6 ++---- 5 files changed, 23 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/tap.c b/drivers/net/tap.c index ce993cc75bf3..d30d730ed5a7 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -742,7 +742,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, /* Move network header to the right position for VLAN tagged packets */ if (eth_type_vlan(skb->protocol) && - __vlan_get_protocol(skb, skb->protocol, &depth) != 0) + vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); /* copy skb_ubuf_info for callback when skb has no error */ @@ -1197,7 +1197,7 @@ static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp) /* Move network header to the right position for VLAN tagged packets */ if (eth_type_vlan(skb->protocol) && - __vlan_get_protocol(skb, skb->protocol, &depth) != 0) + vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); rcu_read_lock(); diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 0f40f379d75c..6ba71957851e 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -637,6 +637,23 @@ static inline __be16 vlan_get_protocol(const struct sk_buff *skb) return __vlan_get_protocol(skb, skb->protocol, NULL); } +/* This version of __vlan_get_protocol() also pulls mac header in skb->head */ +static inline __be16 vlan_get_protocol_and_depth(struct sk_buff *skb, + __be16 type, int *depth) +{ + int maclen; + + type = __vlan_get_protocol(skb, type, &maclen); + + if (type) { + if (!pskb_may_pull(skb, maclen)) + type = 0; + else if (depth) + *depth = maclen; + } + return type; +} + /* A getter for the SKB protocol field which will handle VLAN tags consistently * whether VLAN acceleration is enabled or not. */ diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 57744704ff69..84d6dd5e5b1a 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -42,7 +42,7 @@ int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb eth_type_vlan(skb->protocol)) { int depth; - if (!__vlan_get_protocol(skb, skb->protocol, &depth)) + if (!vlan_get_protocol_and_depth(skb, skb->protocol, &depth)) goto drop; skb_set_network_header(skb, depth); diff --git a/net/core/dev.c b/net/core/dev.c index 735096d42c1d..b3c13e041935 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3335,7 +3335,7 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) type = eth->h_proto; } - return __vlan_get_protocol(skb, type, depth); + return vlan_get_protocol_and_depth(skb, type, depth); } /* openvswitch calls this on rx path, so we need a different check. diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 640d94e34635..94c6a1ffa459 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1934,10 +1934,8 @@ static void packet_parse_headers(struct sk_buff *skb, struct socket *sock) /* Move network header to the right position for VLAN tagged packets */ if (likely(skb->dev->type == ARPHRD_ETHER) && eth_type_vlan(skb->protocol) && - __vlan_get_protocol(skb, skb->protocol, &depth) != 0) { - if (pskb_may_pull(skb, depth)) - skb_set_network_header(skb, depth); - } + vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) + skb_set_network_header(skb, depth); skb_probe_transport_header(skb); } -- cgit From 8432a15cd810c99db464b7e7858d559f3e1920e3 Mon Sep 17 00:00:00 2001 From: Jó Ágila Bitsch Date: Sat, 22 Apr 2023 18:05:32 +0200 Subject: usb: gadget: drop superfluous ':' in doc string MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There was one superfluous ':' that kernel-doc complained about. Reported-by: Randy Dunlap Closes: https://lore.kernel.org/all/c718a490-028d-2682-9ad7-8256d16504bf@infradead.org/ Fixes: fb6211f1584a ("usb: gadget: add doc to struct usb_composite_dev") Signed-off-by: Jó Ágila Bitsch Reviewed-by: Randy Dunlap Link: https://lore.kernel.org/r/ZEQFzMntIrwvZl4+@jo-einhundert Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/composite.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index a2448e98854f..07531c4f4350 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -443,7 +443,7 @@ static inline struct usb_composite_driver *to_cdriver( * @bcd_webusb_version: 0x0100 by default, WebUSB specification version * @b_webusb_vendor_code: 0x0 by default, vendor code for WebUSB * @landing_page: empty by default, landing page to announce in WebUSB - * @use_webusb:: false by default, interested gadgets set it + * @use_webusb: false by default, interested gadgets set it * @os_desc_config: the configuration to be used with OS descriptors * @setup_pending: true when setup request is queued but not completed * @os_desc_pending: true when os_desc request is queued but not completed -- cgit From 948f072ada23e0a504c5e4d7d71d4c83bd0785ec Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 9 May 2023 09:42:47 +1000 Subject: SUNRPC: always free ctxt when freeing deferred request Since the ->xprt_ctxt pointer was added to svc_deferred_req, it has not been sufficient to use kfree() to free a deferred request. We may need to free the ctxt as well. As freeing the ctxt is all that ->xpo_release_rqst() does, we repurpose it to explicit do that even when the ctxt is not stored in an rqst. So we now have ->xpo_release_ctxt() which is given an xprt and a ctxt, which may have been taken either from an rqst or from a dreq. The caller is now responsible for clearing that pointer after the call to ->xpo_release_ctxt. We also clear dr->xprt_ctxt when the ctxt is moved into a new rqst when revisiting a deferred request. This ensures there is only one pointer to the ctxt, so the risk of double freeing in future is reduced. The new code in svc_xprt_release which releases both the ctxt and any rq_deferred depends on this. Fixes: 773f91b2cf3f ("SUNRPC: Fix NFSD's request deferral on RDMA transports") Signed-off-by: NeilBrown Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 2 +- include/linux/sunrpc/svc_xprt.h | 2 +- net/sunrpc/svc_xprt.c | 23 +++++++++++++++++------ net/sunrpc/svcsock.c | 30 ++++++++++++++++-------------- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 11 +++++------ net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 +- 6 files changed, 41 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 24aa159d29a7..fbc4bd423b35 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -176,7 +176,7 @@ extern struct svc_rdma_recv_ctxt * extern void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *ctxt); extern void svc_rdma_flush_recv_queues(struct svcxprt_rdma *rdma); -extern void svc_rdma_release_rqst(struct svc_rqst *rqstp); +extern void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *ctxt); extern int svc_rdma_recvfrom(struct svc_rqst *); /* svc_rdma_rw.c */ diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 867479204840..a6b12631db21 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -23,7 +23,7 @@ struct svc_xprt_ops { int (*xpo_sendto)(struct svc_rqst *); int (*xpo_result_payload)(struct svc_rqst *, unsigned int, unsigned int); - void (*xpo_release_rqst)(struct svc_rqst *); + void (*xpo_release_ctxt)(struct svc_xprt *xprt, void *ctxt); void (*xpo_detach)(struct svc_xprt *); void (*xpo_free)(struct svc_xprt *); void (*xpo_kill_temp_xprt)(struct svc_xprt *); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 5fd94f6bdc75..13a14897bc17 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -532,13 +532,23 @@ void svc_reserve(struct svc_rqst *rqstp, int space) } EXPORT_SYMBOL_GPL(svc_reserve); +static void free_deferred(struct svc_xprt *xprt, struct svc_deferred_req *dr) +{ + if (!dr) + return; + + xprt->xpt_ops->xpo_release_ctxt(xprt, dr->xprt_ctxt); + kfree(dr); +} + static void svc_xprt_release(struct svc_rqst *rqstp) { struct svc_xprt *xprt = rqstp->rq_xprt; - xprt->xpt_ops->xpo_release_rqst(rqstp); + xprt->xpt_ops->xpo_release_ctxt(xprt, rqstp->rq_xprt_ctxt); + rqstp->rq_xprt_ctxt = NULL; - kfree(rqstp->rq_deferred); + free_deferred(xprt, rqstp->rq_deferred); rqstp->rq_deferred = NULL; svc_rqst_release_pages(rqstp); @@ -1054,7 +1064,7 @@ static void svc_delete_xprt(struct svc_xprt *xprt) spin_unlock_bh(&serv->sv_lock); while ((dr = svc_deferred_dequeue(xprt)) != NULL) - kfree(dr); + free_deferred(xprt, dr); call_xpt_users(xprt); svc_xprt_put(xprt); @@ -1176,8 +1186,8 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many) if (too_many || test_bit(XPT_DEAD, &xprt->xpt_flags)) { spin_unlock(&xprt->xpt_lock); trace_svc_defer_drop(dr); + free_deferred(xprt, dr); svc_xprt_put(xprt); - kfree(dr); return; } dr->xprt = NULL; @@ -1222,14 +1232,13 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) dr->addrlen = rqstp->rq_addrlen; dr->daddr = rqstp->rq_daddr; dr->argslen = rqstp->rq_arg.len >> 2; - dr->xprt_ctxt = rqstp->rq_xprt_ctxt; /* back up head to the start of the buffer and copy */ skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip, dr->argslen << 2); } - WARN_ON_ONCE(rqstp->rq_xprt_ctxt != dr->xprt_ctxt); + dr->xprt_ctxt = rqstp->rq_xprt_ctxt; rqstp->rq_xprt_ctxt = NULL; trace_svc_defer(rqstp); svc_xprt_get(rqstp->rq_xprt); @@ -1263,6 +1272,8 @@ static noinline int svc_deferred_recv(struct svc_rqst *rqstp) rqstp->rq_daddr = dr->daddr; rqstp->rq_respages = rqstp->rq_pages; rqstp->rq_xprt_ctxt = dr->xprt_ctxt; + + dr->xprt_ctxt = NULL; svc_xprt_received(rqstp->rq_xprt); return dr->argslen << 2; } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 9989194446a5..63fe7a338992 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -121,27 +121,27 @@ static void svc_reclassify_socket(struct socket *sock) #endif /** - * svc_tcp_release_rqst - Release transport-related resources - * @rqstp: request structure with resources to be released + * svc_tcp_release_ctxt - Release transport-related resources + * @xprt: the transport which owned the context + * @ctxt: the context from rqstp->rq_xprt_ctxt or dr->xprt_ctxt * */ -static void svc_tcp_release_rqst(struct svc_rqst *rqstp) +static void svc_tcp_release_ctxt(struct svc_xprt *xprt, void *ctxt) { } /** - * svc_udp_release_rqst - Release transport-related resources - * @rqstp: request structure with resources to be released + * svc_udp_release_ctxt - Release transport-related resources + * @xprt: the transport which owned the context + * @ctxt: the context from rqstp->rq_xprt_ctxt or dr->xprt_ctxt * */ -static void svc_udp_release_rqst(struct svc_rqst *rqstp) +static void svc_udp_release_ctxt(struct svc_xprt *xprt, void *ctxt) { - struct sk_buff *skb = rqstp->rq_xprt_ctxt; + struct sk_buff *skb = ctxt; - if (skb) { - rqstp->rq_xprt_ctxt = NULL; + if (skb) consume_skb(skb); - } } union svc_pktinfo_u { @@ -696,7 +696,8 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) unsigned int sent; int err; - svc_udp_release_rqst(rqstp); + svc_udp_release_ctxt(xprt, rqstp->rq_xprt_ctxt); + rqstp->rq_xprt_ctxt = NULL; svc_set_cmsg_data(rqstp, cmh); @@ -768,7 +769,7 @@ static const struct svc_xprt_ops svc_udp_ops = { .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, .xpo_result_payload = svc_sock_result_payload, - .xpo_release_rqst = svc_udp_release_rqst, + .xpo_release_ctxt = svc_udp_release_ctxt, .xpo_detach = svc_sock_detach, .xpo_free = svc_sock_free, .xpo_has_wspace = svc_udp_has_wspace, @@ -1301,7 +1302,8 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) unsigned int sent; int err; - svc_tcp_release_rqst(rqstp); + svc_tcp_release_ctxt(xprt, rqstp->rq_xprt_ctxt); + rqstp->rq_xprt_ctxt = NULL; atomic_inc(&svsk->sk_sendqlen); mutex_lock(&xprt->xpt_mutex); @@ -1346,7 +1348,7 @@ static const struct svc_xprt_ops svc_tcp_ops = { .xpo_recvfrom = svc_tcp_recvfrom, .xpo_sendto = svc_tcp_sendto, .xpo_result_payload = svc_sock_result_payload, - .xpo_release_rqst = svc_tcp_release_rqst, + .xpo_release_ctxt = svc_tcp_release_ctxt, .xpo_detach = svc_tcp_sock_detach, .xpo_free = svc_sock_free, .xpo_has_wspace = svc_tcp_has_wspace, diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 1c658fa43063..a22fe7587fa6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -239,21 +239,20 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma, } /** - * svc_rdma_release_rqst - Release transport-specific per-rqst resources - * @rqstp: svc_rqst being released + * svc_rdma_release_ctxt - Release transport-specific per-rqst resources + * @xprt: the transport which owned the context + * @vctxt: the context from rqstp->rq_xprt_ctxt or dr->xprt_ctxt * * Ensure that the recv_ctxt is released whether or not a Reply * was sent. For example, the client could close the connection, * or svc_process could drop an RPC, before the Reply is sent. */ -void svc_rdma_release_rqst(struct svc_rqst *rqstp) +void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *vctxt) { - struct svc_rdma_recv_ctxt *ctxt = rqstp->rq_xprt_ctxt; - struct svc_xprt *xprt = rqstp->rq_xprt; + struct svc_rdma_recv_ctxt *ctxt = vctxt; struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); - rqstp->rq_xprt_ctxt = NULL; if (ctxt) svc_rdma_recv_ctxt_put(rdma, ctxt); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 416b298f74dd..ca04f7a6a085 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -80,7 +80,7 @@ static const struct svc_xprt_ops svc_rdma_ops = { .xpo_recvfrom = svc_rdma_recvfrom, .xpo_sendto = svc_rdma_sendto, .xpo_result_payload = svc_rdma_result_payload, - .xpo_release_rqst = svc_rdma_release_rqst, + .xpo_release_ctxt = svc_rdma_release_ctxt, .xpo_detach = svc_rdma_detach, .xpo_free = svc_rdma_free, .xpo_has_wspace = svc_rdma_has_wspace, -- cgit From 99d46450625590d410f86fe4660a5eff7d3b8343 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Wed, 26 Apr 2023 20:29:28 +0300 Subject: tpm: Prevent hwrng from activating during resume Set TPM_CHIP_FLAG_SUSPENDED in tpm_pm_suspend() and reset in tpm_pm_resume(). While the flag is set, tpm_hwrng() gives back zero bytes. This prevents hwrng from racing during resume. Cc: stable@vger.kernel.org Fixes: 6e592a065d51 ("tpm: Move Linux RNG connection to hwrng") Reviewed-by: Jerry Snitselaar Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm-chip.c | 4 ++++ drivers/char/tpm/tpm-interface.c | 10 ++++++++++ include/linux/tpm.h | 1 + 3 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c index c10a4aa97373..cd48033b804a 100644 --- a/drivers/char/tpm/tpm-chip.c +++ b/drivers/char/tpm/tpm-chip.c @@ -571,6 +571,10 @@ static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) { struct tpm_chip *chip = container_of(rng, struct tpm_chip, hwrng); + /* Give back zero bytes, as TPM chip has not yet fully resumed: */ + if (chip->flags & TPM_CHIP_FLAG_SUSPENDED) + return 0; + return tpm_get_random(chip, data, max); } diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c index 4463d0018290..586ca10b0d72 100644 --- a/drivers/char/tpm/tpm-interface.c +++ b/drivers/char/tpm/tpm-interface.c @@ -412,6 +412,8 @@ int tpm_pm_suspend(struct device *dev) } suspended: + chip->flags |= TPM_CHIP_FLAG_SUSPENDED; + if (rc) dev_err(dev, "Ignoring error %d while suspending\n", rc); return 0; @@ -429,6 +431,14 @@ int tpm_pm_resume(struct device *dev) if (chip == NULL) return -ENODEV; + chip->flags &= ~TPM_CHIP_FLAG_SUSPENDED; + + /* + * Guarantee that SUSPENDED is written last, so that hwrng does not + * activate before the chip has been fully resumed. + */ + wmb(); + return 0; } EXPORT_SYMBOL_GPL(tpm_pm_resume); diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 77693389c3f9..6a1e8f157255 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -282,6 +282,7 @@ enum tpm_chip_flags { TPM_CHIP_FLAG_ALWAYS_POWERED = BIT(5), TPM_CHIP_FLAG_FIRMWARE_POWER_MANAGED = BIT(6), TPM_CHIP_FLAG_FIRMWARE_UPGRADE = BIT(7), + TPM_CHIP_FLAG_SUSPENDED = BIT(8), }; #define to_tpm_chip(d) container_of(d, struct tpm_chip, dev) -- cgit From f15afbd34d8fadbd375f1212e97837e32bc170cc Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Mon, 24 Apr 2023 13:18:35 +0800 Subject: fs: fix undefined behavior in bit shift for SB_NOUSER Shifting signed 32-bit value by 31 bits is undefined, so changing significant bit to unsigned. It was spotted by UBSAN. So let's just fix this by using the BIT() helper for all SB_* flags. Fixes: e462ec50cb5f ("VFS: Differentiate mount flags (MS_*) from internal superblock flags") Signed-off-by: Hao Ge Message-Id: <20230424051835.374204-1-gehao@kylinos.cn> [brauner@kernel.org: use BIT() for all SB_* flags] Signed-off-by: Christian Brauner --- include/linux/fs.h | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 21a981680856..133f0640fb24 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1076,29 +1076,29 @@ extern int send_sigurg(struct fown_struct *fown); * sb->s_flags. Note that these mirror the equivalent MS_* flags where * represented in both. */ -#define SB_RDONLY 1 /* Mount read-only */ -#define SB_NOSUID 2 /* Ignore suid and sgid bits */ -#define SB_NODEV 4 /* Disallow access to device special files */ -#define SB_NOEXEC 8 /* Disallow program execution */ -#define SB_SYNCHRONOUS 16 /* Writes are synced at once */ -#define SB_MANDLOCK 64 /* Allow mandatory locks on an FS */ -#define SB_DIRSYNC 128 /* Directory modifications are synchronous */ -#define SB_NOATIME 1024 /* Do not update access times. */ -#define SB_NODIRATIME 2048 /* Do not update directory access times */ -#define SB_SILENT 32768 -#define SB_POSIXACL (1<<16) /* VFS does not apply the umask */ -#define SB_INLINECRYPT (1<<17) /* Use blk-crypto for encrypted files */ -#define SB_KERNMOUNT (1<<22) /* this is a kern_mount call */ -#define SB_I_VERSION (1<<23) /* Update inode I_version field */ -#define SB_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ +#define SB_RDONLY BIT(0) /* Mount read-only */ +#define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */ +#define SB_NODEV BIT(2) /* Disallow access to device special files */ +#define SB_NOEXEC BIT(3) /* Disallow program execution */ +#define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */ +#define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */ +#define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */ +#define SB_NOATIME BIT(10) /* Do not update access times. */ +#define SB_NODIRATIME BIT(11) /* Do not update directory access times */ +#define SB_SILENT BIT(15) +#define SB_POSIXACL BIT(16) /* VFS does not apply the umask */ +#define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */ +#define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */ +#define SB_I_VERSION BIT(23) /* Update inode I_version field */ +#define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ -#define SB_SUBMOUNT (1<<26) -#define SB_FORCE (1<<27) -#define SB_NOSEC (1<<28) -#define SB_BORN (1<<29) -#define SB_ACTIVE (1<<30) -#define SB_NOUSER (1<<31) +#define SB_SUBMOUNT BIT(26) +#define SB_FORCE BIT(27) +#define SB_NOSEC BIT(28) +#define SB_BORN BIT(29) +#define SB_ACTIVE BIT(30) +#define SB_NOUSER BIT(31) /* These flags relate to encoding and casefolding */ #define SB_ENC_STRICT_MODE_FL (1 << 0) -- cgit From a18ef64fe1e4558b14a6e0ca9fbe8264475b7013 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 May 2023 14:47:12 +0200 Subject: tracing: make ftrace_likely_update() declaration visible This function is only used when CONFIG_TRACE_BRANCH_PROFILING is set and DISABLE_BRANCH_PROFILING is not set, and the declaration is hidden behind this combination of tests. But that causes a warning when building with CONFIG_TRACING_BRANCHES, since that sets DISABLE_BRANCH_PROFILING for the tracing code, and the declaration is thus hidden: kernel/trace/trace_branch.c:205:6: error: no previous prototype for 'ftrace_likely_update' [-Werror=missing-prototypes] Move the declaration out of the #ifdef to avoid the warning. Signed-off-by: Arnd Bergmann Signed-off-by: Linus Torvalds --- include/linux/compiler.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 947a60b801db..d7779a18b24f 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -12,11 +12,10 @@ * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code * to disable branch tracing on a per file basis. */ -#if defined(CONFIG_TRACE_BRANCH_PROFILING) \ - && !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__) void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect, int is_constant); - +#if defined(CONFIG_TRACE_BRANCH_PROFILING) \ + && !defined(DISABLE_BRANCH_PROFILING) && !defined(__CHECKER__) #define likely_notrace(x) __builtin_expect(!!(x), 1) #define unlikely_notrace(x) __builtin_expect(!!(x), 0) -- cgit From 26e239b37ebdfd189a2e6f94d3407e70313348bc Mon Sep 17 00:00:00 2001 From: Joan Bruguera Micó Date: Wed, 3 May 2023 01:32:32 +0000 Subject: mm: shrinkers: fix race condition on debugfs cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When something registers and unregisters many shrinkers, such as: for x in $(seq 10000); do unshare -Ui true; done Sometimes the following error is printed to the kernel log: debugfs: Directory '...' with parent 'shrinker' already present! This occurs since commit badc28d4924b ("mm: shrinkers: fix deadlock in shrinker debugfs") / v6.2: Since the call to `debugfs_remove_recursive` was moved outside the `shrinker_rwsem`/`shrinker_mutex` lock, but the call to `ida_free` stayed inside, a newly registered shrinker can be re-assigned that ID and attempt to create the debugfs directory before the directory from the previous shrinker has been removed. The locking changes in commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless") made the race condition more likely, though it existed before then. Commit badc28d4924b ("mm: shrinkers: fix deadlock in shrinker debugfs") could be reverted since the issue is addressed should no longer occur since the count and scan operations are lockless since commit 20cd1892fcc3 ("mm: shrinkers: make count and scan in shrinker debugfs lockless"). However, since this is a contended lock, prefer instead moving `ida_free` outside the lock to avoid the race. Link: https://lkml.kernel.org/r/20230503013232.299211-1-joanbrugueram@gmail.com Fixes: badc28d4924b ("mm: shrinkers: fix deadlock in shrinker debugfs") Signed-off-by: Joan Bruguera Micó Cc: Qi Zheng Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/shrinker.h | 13 +++++++++++-- mm/shrinker_debug.c | 15 ++++++++++----- mm/vmscan.c | 5 +++-- 3 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 7bde8e1c228a..224293b2dd06 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -107,7 +107,10 @@ extern void synchronize_shrinkers(void); #ifdef CONFIG_SHRINKER_DEBUG extern int shrinker_debugfs_add(struct shrinker *shrinker); -extern struct dentry *shrinker_debugfs_remove(struct shrinker *shrinker); +extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, + int *debugfs_id); +extern void shrinker_debugfs_remove(struct dentry *debugfs_entry, + int debugfs_id); extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...); #else /* CONFIG_SHRINKER_DEBUG */ @@ -115,10 +118,16 @@ static inline int shrinker_debugfs_add(struct shrinker *shrinker) { return 0; } -static inline struct dentry *shrinker_debugfs_remove(struct shrinker *shrinker) +static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, + int *debugfs_id) { + *debugfs_id = -1; return NULL; } +static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, + int debugfs_id) +{ +} static inline __printf(2, 3) int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) { diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 3f83b10c5031..fe10436d9911 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -237,7 +237,8 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) } EXPORT_SYMBOL(shrinker_debugfs_rename); -struct dentry *shrinker_debugfs_remove(struct shrinker *shrinker) +struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, + int *debugfs_id) { struct dentry *entry = shrinker->debugfs_entry; @@ -246,14 +247,18 @@ struct dentry *shrinker_debugfs_remove(struct shrinker *shrinker) kfree_const(shrinker->name); shrinker->name = NULL; - if (entry) { - ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id); - shrinker->debugfs_entry = NULL; - } + *debugfs_id = entry ? shrinker->debugfs_id : -1; + shrinker->debugfs_entry = NULL; return entry; } +void shrinker_debugfs_remove(struct dentry *debugfs_entry, int debugfs_id) +{ + debugfs_remove_recursive(debugfs_entry); + ida_free(&shrinker_debugfs_ida, debugfs_id); +} + static int __init shrinker_debugfs_init(void) { struct shrinker *shrinker; diff --git a/mm/vmscan.c b/mm/vmscan.c index d257916f39e5..6d0cd2840cf0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -805,6 +805,7 @@ EXPORT_SYMBOL(register_shrinker); void unregister_shrinker(struct shrinker *shrinker) { struct dentry *debugfs_entry; + int debugfs_id; if (!(shrinker->flags & SHRINKER_REGISTERED)) return; @@ -814,13 +815,13 @@ void unregister_shrinker(struct shrinker *shrinker) shrinker->flags &= ~SHRINKER_REGISTERED; if (shrinker->flags & SHRINKER_MEMCG_AWARE) unregister_memcg_shrinker(shrinker); - debugfs_entry = shrinker_debugfs_remove(shrinker); + debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); mutex_unlock(&shrinker_mutex); atomic_inc(&shrinker_srcu_generation); synchronize_srcu(&shrinker_srcu); - debugfs_remove_recursive(debugfs_entry); + shrinker_debugfs_remove(debugfs_entry, debugfs_id); kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; -- cgit From 2e9f8ab68f42b059e80db71266c1675c07c664bd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 May 2023 21:45:36 +0200 Subject: mdio_bus: unhide mdio_bus_init prototype mdio_bus_init() is either used as a local module_init() entry, or it gets called in phy_device.c. In the former case, there is no declaration, which causes a warning: drivers/net/phy/mdio_bus.c:1371:12: error: no previous prototype for 'mdio_bus_init' [-Werror=missing-prototypes] Remove the #ifdef around the declaration to avoid the warning.. Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20230516194625.549249-4-arnd@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index c5a0dc829714..6478838405a0 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1900,10 +1900,8 @@ void phy_package_leave(struct phy_device *phydev); int devm_phy_package_join(struct device *dev, struct phy_device *phydev, int addr, size_t priv_size); -#if IS_ENABLED(CONFIG_PHYLIB) int __init mdio_bus_init(void); void mdio_bus_exit(void); -#endif int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data); int phy_ethtool_get_sset_count(struct phy_device *phydev); -- cgit From 14c4be92ebb3e36e392aa9dd8f314038a9f96f3c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 May 2023 18:50:38 -0700 Subject: tls: rx: strp: force mixed decrypted records into copy mode If a record is partially decrypted we'll have to CoW it, anyway, so go into copy mode and allocate a writable skb right away. This will make subsequent fix simpler because we won't have to teach tls_strp_msg_make_copy() how to copy skbs while preserving decrypt status. Tested-by: Shai Amiram Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- include/linux/skbuff.h | 10 ++++++++++ net/tls/tls_strp.c | 16 +++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 738776ab8838..0b40417457cd 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1587,6 +1587,16 @@ static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) to->l4_hash = from->l4_hash; }; +static inline int skb_cmp_decrypted(const struct sk_buff *skb1, + const struct sk_buff *skb2) +{ +#ifdef CONFIG_TLS_DEVICE + return skb2->decrypted - skb1->decrypted; +#else + return 0; +#endif +} + static inline void skb_copy_decrypted(struct sk_buff *to, const struct sk_buff *from) { diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index 24016c865e00..2b6fa9855999 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -317,15 +317,19 @@ static int tls_strp_read_copy(struct tls_strparser *strp, bool qshort) return 0; } -static bool tls_strp_check_no_dup(struct tls_strparser *strp) +static bool tls_strp_check_queue_ok(struct tls_strparser *strp) { unsigned int len = strp->stm.offset + strp->stm.full_len; - struct sk_buff *skb; + struct sk_buff *first, *skb; u32 seq; - skb = skb_shinfo(strp->anchor)->frag_list; - seq = TCP_SKB_CB(skb)->seq; + first = skb_shinfo(strp->anchor)->frag_list; + skb = first; + seq = TCP_SKB_CB(first)->seq; + /* Make sure there's no duplicate data in the queue, + * and the decrypted status matches. + */ while (skb->len < len) { seq += skb->len; len -= skb->len; @@ -333,6 +337,8 @@ static bool tls_strp_check_no_dup(struct tls_strparser *strp) if (TCP_SKB_CB(skb)->seq != seq) return false; + if (skb_cmp_decrypted(first, skb)) + return false; } return true; @@ -413,7 +419,7 @@ static int tls_strp_read_sock(struct tls_strparser *strp) return tls_strp_read_copy(strp, true); } - if (!tls_strp_check_no_dup(strp)) + if (!tls_strp_check_queue_ok(strp)) return tls_strp_read_copy(strp, false); strp->msg_ready = 1; -- cgit From ddaf098ea779b3c1302c7843f6ff01e89b1fd380 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 16 May 2023 21:20:14 +0200 Subject: driver core: class: properly reference count class_dev_iter() When class_dev_iter is initialized, the reference count for the subsys private structure is incremented, but never decremented, causing a memory leak over time. To resolve this, save off a pointer to the internal structure into the class_dev_iter structure and then when the iterator is finished, drop the reference count. Reported-and-tested-by: syzbot+e7afd76ad060fa0d2605@syzkaller.appspotmail.com Fixes: 7b884b7f24b4 ("driver core: class.c: convert to only use class_to_subsys") Reported-by: Mirsad Goran Todorovac Cc: Alan Stern Acked-by: Rafael J. Wysocki Tested-by: Mirsad Goran Todorovac Link: https://lore.kernel.org/r/2023051610-stove-condense-9a77@gregkh Signed-off-by: Greg Kroah-Hartman --- drivers/base/class.c | 2 ++ include/linux/device/class.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/class.c b/drivers/base/class.c index ac1808d1a2e8..05d9df90f621 100644 --- a/drivers/base/class.c +++ b/drivers/base/class.c @@ -320,6 +320,7 @@ void class_dev_iter_init(struct class_dev_iter *iter, const struct class *class, start_knode = &start->p->knode_class; klist_iter_init_node(&sp->klist_devices, &iter->ki, start_knode); iter->type = type; + iter->sp = sp; } EXPORT_SYMBOL_GPL(class_dev_iter_init); @@ -361,6 +362,7 @@ EXPORT_SYMBOL_GPL(class_dev_iter_next); void class_dev_iter_exit(struct class_dev_iter *iter) { klist_iter_exit(&iter->ki); + subsys_put(iter->sp); } EXPORT_SYMBOL_GPL(class_dev_iter_exit); diff --git a/include/linux/device/class.h b/include/linux/device/class.h index 9deeaeb457bb..abf3d3bfb6fe 100644 --- a/include/linux/device/class.h +++ b/include/linux/device/class.h @@ -74,6 +74,7 @@ struct class { struct class_dev_iter { struct klist_iter ki; const struct device_type *type; + struct subsys_private *sp; }; int __must_check class_register(const struct class *class); -- cgit From ae9b15fbe63447bc1d3bba3769f409d17ca6fdf6 Mon Sep 17 00:00:00 2001 From: Taehee Yoo Date: Wed, 17 May 2023 14:30:10 +0000 Subject: net: fix stack overflow when LRO is disabled for virtual interfaces When the virtual interface's feature is updated, it synchronizes the updated feature for its own lower interface. This propagation logic should be worked as the iteration, not recursively. But it works recursively due to the netdev notification unexpectedly. This problem occurs when it disables LRO only for the team and bonding interface type. team0 | +------+------+-----+-----+ | | | | | team1 team2 team3 ... team200 If team0's LRO feature is updated, it generates the NETDEV_FEAT_CHANGE event to its own lower interfaces(team1 ~ team200). It is worked by netdev_sync_lower_features(). So, the NETDEV_FEAT_CHANGE notification logic of each lower interface work iteratively. But generated NETDEV_FEAT_CHANGE event is also sent to the upper interface too. upper interface(team0) generates the NETDEV_FEAT_CHANGE event for its own lower interfaces again. lower and upper interfaces receive this event and generate this event again and again. So, the stack overflow occurs. But it is not the infinite loop issue. Because the netdev_sync_lower_features() updates features before generating the NETDEV_FEAT_CHANGE event. Already synchronized lower interfaces skip notification logic. So, it is just the problem that iteration logic is changed to the recursive unexpectedly due to the notification mechanism. Reproducer: ip link add team0 type team ethtool -K team0 lro on for i in {1..200} do ip link add team$i master team0 type team ethtool -K team$i lro on done ethtool -K team0 lro off In order to fix it, the notifier_ctx member of bonding/team is introduced. Reported-by: syzbot+60748c96cf5c6df8e581@syzkaller.appspotmail.com Fixes: fd867d51f889 ("net/core: generic support for disabling netdev features down stack") Signed-off-by: Taehee Yoo Reviewed-by: Eric Dumazet Reviewed-by: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230517143010.3596250-1-ap420073@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 8 +++++++- drivers/net/team/team.c | 7 ++++++- include/linux/if_team.h | 1 + include/net/bonding.h | 1 + 4 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 3fed888629f7..edbaa1444f8e 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3947,7 +3947,11 @@ static int bond_slave_netdev_event(unsigned long event, unblock_netpoll_tx(); break; case NETDEV_FEAT_CHANGE: - bond_compute_features(bond); + if (!bond->notifier_ctx) { + bond->notifier_ctx = true; + bond_compute_features(bond); + bond->notifier_ctx = false; + } break; case NETDEV_RESEND_IGMP: /* Propagate to master device */ @@ -6342,6 +6346,8 @@ static int bond_init(struct net_device *bond_dev) if (!bond->wq) return -ENOMEM; + bond->notifier_ctx = false; + spin_lock_init(&bond->stats_lock); netdev_lockdep_set_classes(bond_dev); diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index d10606f257c4..555b0b1e9a78 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1629,6 +1629,7 @@ static int team_init(struct net_device *dev) team->dev = dev; team_set_no_mode(team); + team->notifier_ctx = false; team->pcpu_stats = netdev_alloc_pcpu_stats(struct team_pcpu_stats); if (!team->pcpu_stats) @@ -3022,7 +3023,11 @@ static int team_device_event(struct notifier_block *unused, team_del_slave(port->team->dev, dev); break; case NETDEV_FEAT_CHANGE: - team_compute_features(port->team); + if (!port->team->notifier_ctx) { + port->team->notifier_ctx = true; + team_compute_features(port->team); + port->team->notifier_ctx = false; + } break; case NETDEV_PRECHANGEMTU: /* Forbid to change mtu of underlaying device */ diff --git a/include/linux/if_team.h b/include/linux/if_team.h index fc985e5c739d..8de6b6e67829 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -208,6 +208,7 @@ struct team { bool queue_override_enabled; struct list_head *qom_lists; /* array of queue override mapping lists */ bool port_mtu_change_allowed; + bool notifier_ctx; struct { unsigned int count; unsigned int interval; /* in ms */ diff --git a/include/net/bonding.h b/include/net/bonding.h index 0efef2a952b7..59955ac33157 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -221,6 +221,7 @@ struct bonding { struct bond_up_slave __rcu *usable_slaves; struct bond_up_slave __rcu *all_slaves; bool force_primary; + bool notifier_ctx; s32 slave_cnt; /* never change this value outside the attach/detach wrappers */ int (*recv_probe)(const struct sk_buff *, struct bonding *, struct slave *); -- cgit From e3afec91aad23c52dfe426c7d7128e4839c3eed8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 20 May 2023 11:00:10 +0200 Subject: block: remove NFL4_UFLG_MASK The NFL4_UFLG_MASK define slipped in in commit 9208d4149758 ("block: add a ->get_unique_id method") and should never have been added, as NFSD as the only user of it already has it's copy. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230520090010.527046-1-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b441e633f4dd..c0ffe203a602 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1376,8 +1376,6 @@ enum blk_unique_id { BLK_UID_NAA = 3, }; -#define NFL4_UFLG_MASK 0x0000003F - struct block_device_operations { void (*submit_bio)(struct bio *bio); int (*poll_bio)(struct bio *bio, struct io_comp_batch *iob, -- cgit From c7dd225bc224726c22db08e680bf787f60ebdee3 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Sun, 2 Apr 2023 17:14:10 +0300 Subject: net/mlx5: DR, Check force-loopback RC QP capability independently from RoCE SW Steering uses RC QP for writing STEs to ICM. This writingis done in LB (loopback), and FL (force-loopback) QP is preferred for performance. FL is available when RoCE is enabled or disabled based on RoCE caps. This patch adds reading of FL capability from HCA caps in addition to the existing reading from RoCE caps, thus fixing the case where we didn't have loopback enabled when RoCE was disabled. Fixes: 7304d603a57a ("net/mlx5: DR, Add support for force-loopback QP") Signed-off-by: Itamar Gozlan Signed-off-by: Yevgeny Kliteynik Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c | 4 +++- include/linux/mlx5/mlx5_ifc.h | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c index 3835ba3f4dda..1aa525e509f1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c @@ -117,6 +117,8 @@ int mlx5dr_cmd_query_device(struct mlx5_core_dev *mdev, caps->gvmi = MLX5_CAP_GEN(mdev, vhca_id); caps->flex_protocols = MLX5_CAP_GEN(mdev, flex_parser_protocols); caps->sw_format_ver = MLX5_CAP_GEN(mdev, steering_format_version); + caps->roce_caps.fl_rc_qp_when_roce_disabled = + MLX5_CAP_GEN(mdev, fl_rc_qp_when_roce_disabled); if (MLX5_CAP_GEN(mdev, roce)) { err = dr_cmd_query_nic_vport_roce_en(mdev, 0, &roce_en); @@ -124,7 +126,7 @@ int mlx5dr_cmd_query_device(struct mlx5_core_dev *mdev, return err; caps->roce_caps.roce_en = roce_en; - caps->roce_caps.fl_rc_qp_when_roce_disabled = + caps->roce_caps.fl_rc_qp_when_roce_disabled |= MLX5_CAP_ROCE(mdev, fl_rc_qp_when_roce_disabled); caps->roce_caps.fl_rc_qp_when_roce_enabled = MLX5_CAP_ROCE(mdev, fl_rc_qp_when_roce_enabled); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index dc5e2cb302a5..b89778d0d326 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1705,7 +1705,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 rc[0x1]; u8 uar_4k[0x1]; - u8 reserved_at_241[0x9]; + u8 reserved_at_241[0x7]; + u8 fl_rc_qp_when_roce_disabled[0x1]; + u8 regexp_params[0x1]; u8 uar_sz[0x6]; u8 port_selection_cap[0x1]; u8 reserved_at_248[0x1]; -- cgit From 29173d07f79883ac94f5570294f98af3d4287382 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 May 2023 19:56:06 -0700 Subject: bpf, sockmap: Convert schedule_work into delayed_work Sk_buffs are fed into sockmap verdict programs either from a strparser (when the user might want to decide how framing of skb is done by attaching another parser program) or directly through tcp_read_sock. The tcp_read_sock is the preferred method for performance when the BPF logic is a stream parser. The flow for Cilium's common use case with a stream parser is, tcp_read_sock() sk_psock_verdict_recv ret = bpf_prog_run_pin_on_cpu() sk_psock_verdict_apply(sock, skb, ret) // if system is under memory pressure or app is slow we may // need to queue skb. Do this queuing through ingress_skb and // then kick timer to wake up handler skb_queue_tail(ingress_skb, skb) schedule_work(work); The work queue is wired up to sk_psock_backlog(). This will then walk the ingress_skb skb list that holds our sk_buffs that could not be handled, but should be OK to run at some later point. However, its possible that the workqueue doing this work still hits an error when sending the skb. When this happens the skbuff is requeued on a temporary 'state' struct kept with the workqueue. This is necessary because its possible to partially send an skbuff before hitting an error and we need to know how and where to restart when the workqueue runs next. Now for the trouble, we don't rekick the workqueue. This can cause a stall where the skbuff we just cached on the state variable might never be sent. This happens when its the last packet in a flow and no further packets come along that would cause the system to kick the workqueue from that side. To fix we could do simple schedule_work(), but while under memory pressure it makes sense to back off some instead of continue to retry repeatedly. So instead to fix convert schedule_work to schedule_delayed_work and add backoff logic to reschedule from backlog queue on errors. Its not obvious though what a good backoff is so use '1'. To test we observed some flakes whil running NGINX compliance test with sockmap we attributed these failed test to this bug and subsequent issue. >From on list discussion. This commit bec217197b41("skmsg: Schedule psock work if the cached skb exists on the psock") was intended to address similar race, but had a couple cases it missed. Most obvious it only accounted for receiving traffic on the local socket so if redirecting into another socket we could still get an sk_buff stuck here. Next it missed the case where copied=0 in the recv() handler and then we wouldn't kick the scheduler. Also its sub-optimal to require userspace to kick the internal mechanisms of sockmap to wake it up and copy data to user. It results in an extra syscall and requires the app to actual handle the EAGAIN correctly. Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: William Findlay Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230523025618.113937-3-john.fastabend@gmail.com --- include/linux/skmsg.h | 2 +- net/core/skmsg.c | 21 ++++++++++++++------- net/core/sock_map.c | 3 ++- 3 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 84f787416a54..904ff9a32ad6 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -105,7 +105,7 @@ struct sk_psock { struct proto *sk_proto; struct mutex work_mutex; struct sk_psock_work_state work_state; - struct work_struct work; + struct delayed_work work; struct rcu_work rwork; }; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 4a3dc8d27295..0a9ee2acac0b 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -482,7 +482,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, } out: if (psock->work_state.skb && copied > 0) - schedule_work(&psock->work); + schedule_delayed_work(&psock->work, 0); return copied; } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); @@ -640,7 +640,8 @@ static void sk_psock_skb_state(struct sk_psock *psock, static void sk_psock_backlog(struct work_struct *work) { - struct sk_psock *psock = container_of(work, struct sk_psock, work); + struct delayed_work *dwork = to_delayed_work(work); + struct sk_psock *psock = container_of(dwork, struct sk_psock, work); struct sk_psock_work_state *state = &psock->work_state; struct sk_buff *skb = NULL; bool ingress; @@ -680,6 +681,12 @@ start: if (ret == -EAGAIN) { sk_psock_skb_state(psock, state, skb, len, off); + + /* Delay slightly to prioritize any + * other work that might be here. + */ + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_delayed_work(&psock->work, 1); goto end; } /* Hard errors break pipe and stop xmit. */ @@ -734,7 +741,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) INIT_LIST_HEAD(&psock->link); spin_lock_init(&psock->link_lock); - INIT_WORK(&psock->work, sk_psock_backlog); + INIT_DELAYED_WORK(&psock->work, sk_psock_backlog); mutex_init(&psock->work_mutex); INIT_LIST_HEAD(&psock->ingress_msg); spin_lock_init(&psock->ingress_lock); @@ -823,7 +830,7 @@ static void sk_psock_destroy(struct work_struct *work) sk_psock_done_strp(psock); - cancel_work_sync(&psock->work); + cancel_delayed_work_sync(&psock->work); mutex_destroy(&psock->work_mutex); psock_progs_drop(&psock->progs); @@ -938,7 +945,7 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) } skb_queue_tail(&psock_other->ingress_skb, skb); - schedule_work(&psock_other->work); + schedule_delayed_work(&psock_other->work, 0); spin_unlock_bh(&psock_other->ingress_lock); return 0; } @@ -1018,7 +1025,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { skb_queue_tail(&psock->ingress_skb, skb); - schedule_work(&psock->work); + schedule_delayed_work(&psock->work, 0); err = 0; } spin_unlock_bh(&psock->ingress_lock); @@ -1049,7 +1056,7 @@ static void sk_psock_write_space(struct sock *sk) psock = sk_psock(sk); if (likely(psock)) { if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) - schedule_work(&psock->work); + schedule_delayed_work(&psock->work, 0); write_space = psock->saved_write_space; } rcu_read_unlock(); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 7c189c2e2fbf..00afb66cd095 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1644,9 +1644,10 @@ void sock_map_close(struct sock *sk, long timeout) rcu_read_unlock(); sk_psock_stop(psock); release_sock(sk); - cancel_work_sync(&psock->work); + cancel_delayed_work_sync(&psock->work); sk_psock_put(sk, psock); } + /* Make sure we do not recurse. This is a bug. * Leak the socket instead of crashing on a stack overflow. */ -- cgit From 405df89dd52cbcd69a3cd7d9a10d64de38f854b2 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 22 May 2023 19:56:08 -0700 Subject: bpf, sockmap: Improved check for empty queue We noticed some rare sk_buffs were stepping past the queue when system was under memory pressure. The general theory is to skip enqueueing sk_buffs when its not necessary which is the normal case with a system that is properly provisioned for the task, no memory pressure and enough cpu assigned. But, if we can't allocate memory due to an ENOMEM error when enqueueing the sk_buff into the sockmap receive queue we push it onto a delayed workqueue to retry later. When a new sk_buff is received we then check if that queue is empty. However, there is a problem with simply checking the queue length. When a sk_buff is being processed from the ingress queue but not yet on the sockmap msg receive queue its possible to also recv a sk_buff through normal path. It will check the ingress queue which is zero and then skip ahead of the pkt being processed. Previously we used sock lock from both contexts which made the problem harder to hit, but not impossible. To fix instead of popping the skb from the queue entirely we peek the skb from the queue and do the copy there. This ensures checks to the queue length are non-zero while skb is being processed. Then finally when the entire skb has been copied to user space queue or another socket we pop it off the queue. This way the queue length check allows bypassing the queue only after the list has been completely processed. To reproduce issue we run NGINX compliance test with sockmap running and observe some flakes in our testing that we attributed to this issue. Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()") Suggested-by: Jakub Sitnicki Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Tested-by: William Findlay Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20230523025618.113937-5-john.fastabend@gmail.com --- include/linux/skmsg.h | 1 - net/core/skmsg.c | 32 ++++++++------------------------ 2 files changed, 8 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 904ff9a32ad6..054d7911bfc9 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -71,7 +71,6 @@ struct sk_psock_link { }; struct sk_psock_work_state { - struct sk_buff *skb; u32 len; u32 off; }; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 76ff15f8bb06..bcd45a99a3db 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -622,16 +622,12 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, static void sk_psock_skb_state(struct sk_psock *psock, struct sk_psock_work_state *state, - struct sk_buff *skb, int len, int off) { spin_lock_bh(&psock->ingress_lock); if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { - state->skb = skb; state->len = len; state->off = off; - } else { - sock_drop(psock->sk, skb); } spin_unlock_bh(&psock->ingress_lock); } @@ -642,23 +638,17 @@ static void sk_psock_backlog(struct work_struct *work) struct sk_psock *psock = container_of(dwork, struct sk_psock, work); struct sk_psock_work_state *state = &psock->work_state; struct sk_buff *skb = NULL; + u32 len = 0, off = 0; bool ingress; - u32 len, off; int ret; mutex_lock(&psock->work_mutex); - if (unlikely(state->skb)) { - spin_lock_bh(&psock->ingress_lock); - skb = state->skb; + if (unlikely(state->len)) { len = state->len; off = state->off; - state->skb = NULL; - spin_unlock_bh(&psock->ingress_lock); } - if (skb) - goto start; - while ((skb = skb_dequeue(&psock->ingress_skb))) { + while ((skb = skb_peek(&psock->ingress_skb))) { len = skb->len; off = 0; if (skb_bpf_strparser(skb)) { @@ -667,7 +657,6 @@ static void sk_psock_backlog(struct work_struct *work) off = stm->offset; len = stm->full_len; } -start: ingress = skb_bpf_ingress(skb); skb_bpf_redirect_clear(skb); do { @@ -677,8 +666,7 @@ start: len, ingress); if (ret <= 0) { if (ret == -EAGAIN) { - sk_psock_skb_state(psock, state, skb, - len, off); + sk_psock_skb_state(psock, state, len, off); /* Delay slightly to prioritize any * other work that might be here. @@ -690,15 +678,16 @@ start: /* Hard errors break pipe and stop xmit. */ sk_psock_report_error(psock, ret ? -ret : EPIPE); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); - sock_drop(psock->sk, skb); goto end; } off += ret; len -= ret; } while (len); - if (!ingress) + skb = skb_dequeue(&psock->ingress_skb); + if (!ingress) { kfree_skb(skb); + } } end: mutex_unlock(&psock->work_mutex); @@ -791,11 +780,6 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock) skb_bpf_redirect_clear(skb); sock_drop(psock->sk, skb); } - kfree_skb(psock->work_state.skb); - /* We null the skb here to ensure that calls to sk_psock_backlog - * do not pick up the free'd skb. - */ - psock->work_state.skb = NULL; __sk_psock_purge_ingress_msg(psock); } @@ -814,7 +798,6 @@ void sk_psock_stop(struct sk_psock *psock) spin_lock_bh(&psock->ingress_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); sk_psock_cork_free(psock); - __sk_psock_zap_ingress(psock); spin_unlock_bh(&psock->ingress_lock); } @@ -829,6 +812,7 @@ static void sk_psock_destroy(struct work_struct *work) sk_psock_done_strp(psock); cancel_delayed_work_sync(&psock->work); + __sk_psock_zap_ingress(psock); mutex_destroy(&psock->work_mutex); psock_progs_drop(&psock->progs); -- cgit From 335b4223466dd75f9f3ea4918187afbadd22e5c8 Mon Sep 17 00:00:00 2001 From: Maximilian Heyne Date: Wed, 3 May 2023 13:16:53 +0000 Subject: x86/pci/xen: populate MSI sysfs entries Commit bf5e758f02fc ("genirq/msi: Simplify sysfs handling") reworked the creation of sysfs entries for MSI IRQs. The creation used to be in msi_domain_alloc_irqs_descs_locked after calling ops->domain_alloc_irqs. Then it moved into __msi_domain_alloc_irqs which is an implementation of domain_alloc_irqs. However, Xen comes with the only other implementation of domain_alloc_irqs and hence doesn't run the sysfs population code anymore. Commit 6c796996ee70 ("x86/pci/xen: Fixup fallout from the PCI/MSI overhaul") set the flag MSI_FLAG_DEV_SYSFS for the xen msi_domain_info but that doesn't actually have an effect because Xen uses it's own domain_alloc_irqs implementation. Fix this by making use of the fallback functions for sysfs population. Fixes: bf5e758f02fc ("genirq/msi: Simplify sysfs handling") Signed-off-by: Maximilian Heyne Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20230503131656.15928-1-mheyne@amazon.de Signed-off-by: Juergen Gross --- arch/x86/pci/xen.c | 8 +++++--- include/linux/msi.h | 9 ++++++++- kernel/irq/msi.c | 4 ++-- 3 files changed, 15 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 8babce71915f..014c508e914d 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -198,7 +198,7 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) i++; } kfree(v); - return 0; + return msi_device_populate_sysfs(&dev->dev); error: if (ret == -ENOSYS) @@ -254,7 +254,7 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) dev_dbg(&dev->dev, "xen: msi --> pirq=%d --> irq=%d\n", pirq, irq); } - return 0; + return msi_device_populate_sysfs(&dev->dev); error: dev_err(&dev->dev, "Failed to create MSI%s! ret=%d!\n", @@ -346,7 +346,7 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) if (ret < 0) goto out; } - ret = 0; + ret = msi_device_populate_sysfs(&dev->dev); out: return ret; } @@ -394,6 +394,8 @@ static void xen_teardown_msi_irqs(struct pci_dev *dev) xen_destroy_irq(msidesc->irq + i); msidesc->irq = 0; } + + msi_device_destroy_sysfs(&dev->dev); } static void xen_pv_teardown_msi_irqs(struct pci_dev *dev) diff --git a/include/linux/msi.h b/include/linux/msi.h index cdb14a1ef268..a50ea79522f8 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -383,6 +383,13 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc); void arch_teardown_msi_irq(unsigned int irq); int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type); void arch_teardown_msi_irqs(struct pci_dev *dev); +#endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */ + +/* + * Xen uses non-default msi_domain_ops and hence needs a way to populate sysfs + * entries of MSI IRQs. + */ +#if defined(CONFIG_PCI_XEN) || defined(CONFIG_PCI_MSI_ARCH_FALLBACKS) #ifdef CONFIG_SYSFS int msi_device_populate_sysfs(struct device *dev); void msi_device_destroy_sysfs(struct device *dev); @@ -390,7 +397,7 @@ void msi_device_destroy_sysfs(struct device *dev); static inline int msi_device_populate_sysfs(struct device *dev) { return 0; } static inline void msi_device_destroy_sysfs(struct device *dev) { } #endif /* !CONFIG_SYSFS */ -#endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */ +#endif /* CONFIG_PCI_XEN || CONFIG_PCI_MSI_ARCH_FALLBACKS */ /* * The restore hook is still available even for fully irq domain based diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 7a97bcb086bf..b4c31a5c1147 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -542,7 +542,7 @@ fail: return ret; } -#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS +#if defined(CONFIG_PCI_MSI_ARCH_FALLBACKS) || defined(CONFIG_PCI_XEN) /** * msi_device_populate_sysfs - Populate msi_irqs sysfs entries for a device * @dev: The device (PCI, platform etc) which will get sysfs entries @@ -574,7 +574,7 @@ void msi_device_destroy_sysfs(struct device *dev) msi_for_each_desc(desc, dev, MSI_DESC_ALL) msi_sysfs_remove_desc(dev, desc); } -#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK */ +#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK || CONFIG_PCI_XEN */ #else /* CONFIG_SYSFS */ static inline int msi_sysfs_create_group(struct device *dev) { return 0; } static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) { return 0; } -- cgit From 9828ed3f695a138f7add89fa2a186ababceb8006 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 25 May 2023 09:32:25 -0700 Subject: module: error out early on concurrent load of the same module file It turns out that udev under certain circumstances will concurrently try to load the same modules over-and-over excessively. This isn't a kernel bug, but it ends up affecting the kernel, to the point that under certain circumstances we can fail to boot, because the kernel uses a lot of memory to read all the module data all at once. Note that it isn't a memory leak, it's just basically a thundering herd problem happening at bootup with a lot of CPUs, with the worst cases then being pretty bad. Admittedly the worst situations are somewhat contrived: lots and lots of CPUs, not a lot of memory, and KASAN enabled to make it all slower and as such (unintentionally) exacerbate the problem. Luis explains: [1] "My best assessment of the situation is that each CPU in udev ends up triggering a load of duplicate set of modules, not just one, but *a lot*. Not sure what heuristics udev uses to load a set of modules per CPU." Petr Pavlu chimes in: [2] "My understanding is that udev workers are forked. An initial kmod context is created by the main udevd process but no sharing happens after the fork. It means that the mentioned memory pool logic doesn't really kick in. Multiple parallel load requests come from multiple udev workers, for instance, each handling an udev event for one CPU device and making the exactly same requests as all others are doing at the same time. The optimization idea would be to recognize these duplicate requests at the udevd/kmod level and converge them" Note that module loading has tried to mitigate this issue before, see for example commit 064f4536d139 ("module: avoid allocation if module is already present and ready"), which has a few ASCII graphs on memory use due to this same issue. However, while that noticed that the module was already loaded, and exited with an error early before spending any more time on setting up the module, it didn't handle the case of multiple concurrent module loads all being active - but not complete - at the same time. Yes, one of them will eventually win the race and finalize its copy, and the others will then notice that the module already exists and error out, but while this all happens, we have tons of unnecessary concurrent work being done. Again, the real fix is for udev to not do that (maybe it should use threads instead of fork, and have actual shared data structures and not cause duplicate work). That real fix is apparently not trivial. But it turns out that the kernel already has a pretty good model for dealing with concurrent access to the same file: the i_writecount of the inode. In fact, the module loading already indirectly uses 'i_writecount' , because 'kernel_file_read()' will in fact do ret = deny_write_access(file); if (ret) return ret; ... allow_write_access(file); around the read of the file data. We do not allow concurrent writes to the file, and return -ETXTBUSY if the file was open for writing at the same time as the module data is loaded from it. And the solution to the reader concurrency problem is to simply extend this "no concurrent writers" logic to simply be "exclusive access". Note that "exclusive" in this context isn't really some absolute thing: it's only exclusion from writers and from other "special readers" that do this writer denial. So we simply introduce a variation of that "deny_write_access()" logic that not only denies write access, but also requires that this is the _only_ such access that denies write access. Which means that you can't start loading a module that is already being loaded as a module by somebody else, or you will get the same -ETXTBSY error that you would get if there were writers around. [ It also means that you can't try to load a currently executing executable as a module, for the same reason: executables do that same "deny_write_access()" thing, and that's obviously where the whole ETXTBSY logic traditionally came from. This is not a problem for kernel modules, since the set of normal executable files and kernel module files is entirely disjoint. ] This new function is called "exclusive_deny_write_access()", and the implementation is trivial, in that it's just an atomic decrement of i_writecount if it was 0 before. To use that new exclusivity check, all we then do is wrap the module loading with that exclusive_deny_write_access()() / allow_write_access() pair. The actual patch is a bit bigger than that, because we want to surround not just the "load file data" part, but the whole module setup, to get maximum exclusion. So this ends up splitting up "finit_module()" into a few helper functions to make it all very clear and legible. In Luis' test-case (bringing up 255 vcpu's in a virtual machine [3]), the "wasted vmalloc" space (ie module data read into a vmalloc'ed area in order to be loaded as a module, but then discarded because somebody else loaded the same module instead) dropped from 1.8GiB to 474kB. Yes, that's gigabytes to kilobytes. It doesn't drop completely to zero, because even with this change, you can still end up having completely serial pointless module loads, where one udev process has loaded a module fully (and thus the kernel has released that exclusive lock on the module file), and then another udev process tries to load the same module again. So while we cannot fully get rid of the fundamental bug in user space, we _can_ get rid of the excessive concurrent thundering herd effect. A couple of final side notes on this all: - This tweak only affects the "finit_module()" system call, which gives the kernel a file descriptor with the module data. You can also just feed the module data as raw data from user space with "init_module()" (note the lack of 'f' at the beginning), and obviously for that case we do _not_ have any "exclusive read" logic. So if you absolutely want to do things wrong in user space, and try to load the same module multiple times, and error out only later when the kernel ends up saying "you can't load the same module name twice", you can still do that. And in fact, some distros will do exactly that, because they will uncompress the kernel module data in user space before feeding it to the kernel (mainly because they haven't started using the new kernel side decompression yet). So this is not some absolute "you can't do concurrent loads of the same module". It's literally just a very simple heuristic that will catch it early in case you try to load the exact same module file at the same time, and in that case avoid a potentially nasty situation. - There is another user of "deny_write_access()": the verity code that enables fs-verity on a file (the FS_IOC_ENABLE_VERITY ioctl). If you use fs-verity and you care about verifying the kernel modules (which does make sense), you should do it *before* loading said kernel module. That may sound obvious, but now the implementation basically requires it. Because if you try to do it concurrently, the kernel may refuse to load the module file that is being set up by the fs-verity code. - This all will obviously mean that if you insist on loading the same module in parallel, only one module load will succeed, and the others will return with an error. That was true before too, but what is different is that the -ETXTBSY error can be returned *before* the success case of another process fully loading and instantiating the module. Again, that might sound obvious, and it is indeed the whole point of the whole change: we are much quicker to notice the whole "you're already in the process of loading this module". So it's very much intentional, but it does mean that if you just spray the kernel with "finit_module()", and expect that the module is immediately loaded afterwards without checking the return value, you are doing something horribly horribly wrong. I'd like to say that that would never happen, but the whole _reason_ for this commit is that udev is currently doing something horribly horribly wrong, so ... Link: https://lore.kernel.org/all/ZEGopJ8VAYnE7LQ2@bombadil.infradead.org/ [1] Link: https://lore.kernel.org/all/23bd0ce6-ef78-1cd8-1f21-0e706a00424a@suse.com/ [2] Link: https://lore.kernel.org/lkml/ZG%2Fa+nrt4%2FAAUi5z@bombadil.infradead.org/ [3] Cc: Greg Kroah-Hartman Cc: Lucas De Marchi Cc: Petr Pavlu Tested-by: Luis Chamberlain Signed-off-by: Linus Torvalds --- include/linux/fs.h | 6 ++++++ kernel/module/main.c | 58 ++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 49 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 133f0640fb24..86b50271b4f7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2566,6 +2566,12 @@ static inline int deny_write_access(struct file *file) struct inode *inode = file_inode(file); return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY; } +static inline int exclusive_deny_write_access(struct file *file) +{ + int old = 0; + struct inode *inode = file_inode(file); + return atomic_try_cmpxchg(&inode->i_writecount, &old, -1) ? 0 : -ETXTBSY; +} static inline void put_write_access(struct inode * inode) { atomic_dec(&inode->i_writecount); diff --git a/kernel/module/main.c b/kernel/module/main.c index 044aa2c9e3cb..b4c7e925fdb0 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -3057,25 +3057,13 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, return load_module(&info, uargs, 0); } -SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) +static int file_init_module(struct file *file, const char __user * uargs, int flags) { struct load_info info = { }; void *buf = NULL; int len; - int err; - - err = may_init_module(); - if (err) - return err; - - pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); - if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS - |MODULE_INIT_IGNORE_VERMAGIC - |MODULE_INIT_COMPRESSED_FILE)) - return -EINVAL; - - len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL, + len = kernel_read_file(file, 0, &buf, INT_MAX, NULL, READING_MODULE); if (len < 0) { mod_stat_inc(&failed_kreads); @@ -3084,7 +3072,7 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) } if (flags & MODULE_INIT_COMPRESSED_FILE) { - err = module_decompress(&info, buf, len); + int err = module_decompress(&info, buf, len); vfree(buf); /* compressed data is no longer needed */ if (err) { mod_stat_inc(&failed_decompress); @@ -3099,6 +3087,46 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) return load_module(&info, uargs, flags); } +/* + * kernel_read_file() will already deny write access, but module + * loading wants _exclusive_ access to the file, so we do that + * here, along with basic sanity checks. + */ +static int prepare_file_for_module_load(struct file *file) +{ + if (!file || !(file->f_mode & FMODE_READ)) + return -EBADF; + if (!S_ISREG(file_inode(file)->i_mode)) + return -EINVAL; + return exclusive_deny_write_access(file); +} + +SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) +{ + struct fd f; + int err; + + err = may_init_module(); + if (err) + return err; + + pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags); + + if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS + |MODULE_INIT_IGNORE_VERMAGIC + |MODULE_INIT_COMPRESSED_FILE)) + return -EINVAL; + + f = fdget(fd); + err = prepare_file_for_module_load(f.file); + if (!err) { + err = file_init_module(f.file, uargs, flags); + allow_write_access(f.file); + } + fdput(f); + return err; +} + /* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */ char *module_flags(struct module *mod, char *buf, bool show_state) { -- cgit From 0cfb4a1af386427cdaba98f18f501eb074985cfd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 15 Jun 2023 14:58:52 +0100 Subject: genirq: Use BIT() for the IRQD_* state flags As we're about to use the last bit available in the IRQD_* state flags, rewrite these flags with BIT(), which ensures that these constant do not represent a signed value. Signed-off-by: Marc Zyngier --- include/linux/irq.h | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index b1b28affb32a..d9c86db69982 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -226,29 +226,29 @@ struct irq_data { */ enum { IRQD_TRIGGER_MASK = 0xf, - IRQD_SETAFFINITY_PENDING = (1 << 8), - IRQD_ACTIVATED = (1 << 9), - IRQD_NO_BALANCING = (1 << 10), - IRQD_PER_CPU = (1 << 11), - IRQD_AFFINITY_SET = (1 << 12), - IRQD_LEVEL = (1 << 13), - IRQD_WAKEUP_STATE = (1 << 14), - IRQD_MOVE_PCNTXT = (1 << 15), - IRQD_IRQ_DISABLED = (1 << 16), - IRQD_IRQ_MASKED = (1 << 17), - IRQD_IRQ_INPROGRESS = (1 << 18), - IRQD_WAKEUP_ARMED = (1 << 19), - IRQD_FORWARDED_TO_VCPU = (1 << 20), - IRQD_AFFINITY_MANAGED = (1 << 21), - IRQD_IRQ_STARTED = (1 << 22), - IRQD_MANAGED_SHUTDOWN = (1 << 23), - IRQD_SINGLE_TARGET = (1 << 24), - IRQD_DEFAULT_TRIGGER_SET = (1 << 25), - IRQD_CAN_RESERVE = (1 << 26), - IRQD_MSI_NOMASK_QUIRK = (1 << 27), - IRQD_HANDLE_ENFORCE_IRQCTX = (1 << 28), - IRQD_AFFINITY_ON_ACTIVATE = (1 << 29), - IRQD_IRQ_ENABLED_ON_SUSPEND = (1 << 30), + IRQD_SETAFFINITY_PENDING = BIT(8), + IRQD_ACTIVATED = BIT(9), + IRQD_NO_BALANCING = BIT(10), + IRQD_PER_CPU = BIT(11), + IRQD_AFFINITY_SET = BIT(12), + IRQD_LEVEL = BIT(13), + IRQD_WAKEUP_STATE = BIT(14), + IRQD_MOVE_PCNTXT = BIT(15), + IRQD_IRQ_DISABLED = BIT(16), + IRQD_IRQ_MASKED = BIT(17), + IRQD_IRQ_INPROGRESS = BIT(18), + IRQD_WAKEUP_ARMED = BIT(19), + IRQD_FORWARDED_TO_VCPU = BIT(20), + IRQD_AFFINITY_MANAGED = BIT(21), + IRQD_IRQ_STARTED = BIT(22), + IRQD_MANAGED_SHUTDOWN = BIT(23), + IRQD_SINGLE_TARGET = BIT(24), + IRQD_DEFAULT_TRIGGER_SET = BIT(25), + IRQD_CAN_RESERVE = BIT(26), + IRQD_MSI_NOMASK_QUIRK = BIT(27), + IRQD_HANDLE_ENFORCE_IRQCTX = BIT(28), + IRQD_AFFINITY_ON_ACTIVATE = BIT(29), + IRQD_IRQ_ENABLED_ON_SUSPEND = BIT(30), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) -- cgit From 9c15eeb5362c48dd27d51bd72e8873341fa9383c Mon Sep 17 00:00:00 2001 From: James Gowans Date: Thu, 8 Jun 2023 14:00:20 +0200 Subject: genirq: Allow fasteoi handler to resend interrupts on concurrent handling There is a class of interrupt controllers out there that, once they have signalled a given interrupt number, will still signal incoming instances of the *same* interrupt despite the original interrupt not having been EOIed yet. As long as the new interrupt reaches the *same* CPU, nothing bad happens, as that CPU still has its interrupts globally disabled, and we will only take the new interrupt once the interrupt has been EOIed. However, things become more "interesting" if an affinity change comes in while the interrupt is being handled. More specifically, while the per-irq lock is being dropped. This results in the affinity change taking place immediately. At this point, there is nothing that prevents the interrupt from firing on the new target CPU. We end-up with the interrupt running concurrently on two CPUs, which isn't a good thing. And that's where things become worse: the new CPU notices that the interrupt handling is in progress (irq_may_run() return false), and *drops the interrupt on the floor*. The whole race looks like this: CPU 0 | CPU 1 -----------------------------|----------------------------- interrupt start | handle_fasteoi_irq | set_affinity(CPU 1) handler | ... | interrupt start ... | handle_fasteoi_irq -> early out handle_fasteoi_irq return | interrupt end interrupt end | If the interrupt was an edge, too bad. The interrupt is lost, and the system will eventually die one way or another. Not great. A way to avoid this situation is to detect this problem at the point we handle the interrupt on the new target. Instead of dropping the interrupt, use the resend mechanism to force it to be replayed. Also, in order to limit the impact of this workaround to the pathetic architectures that require it, gate it behind a new irq flag aptly named IRQD_RESEND_WHEN_IN_PROGRESS. Suggested-by: Marc Zyngier Signed-off-by: James Gowans Cc: Thomas Gleixner Cc: Marc Zyngier Cc: KarimAllah Raslan Cc: Yipeng Zou Cc: Zhang Jianhua [maz: reworded commit mesage] Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230608120021.3273400-3-jgowans@amazon.com --- include/linux/irq.h | 13 +++++++++++++ kernel/irq/chip.c | 16 +++++++++++++++- kernel/irq/debugfs.c | 2 ++ 3 files changed, 30 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index d9c86db69982..d8a6fdce9373 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -223,6 +223,8 @@ struct irq_data { * irq_chip::irq_set_affinity() when deactivated. * IRQD_IRQ_ENABLED_ON_SUSPEND - Interrupt is enabled on suspend by irq pm if * irqchip have flag IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND set. + * IRQD_RESEND_WHEN_IN_PROGRESS - Interrupt may fire when already in progress in which + * case it must be resent at the next available opportunity. */ enum { IRQD_TRIGGER_MASK = 0xf, @@ -249,6 +251,7 @@ enum { IRQD_HANDLE_ENFORCE_IRQCTX = BIT(28), IRQD_AFFINITY_ON_ACTIVATE = BIT(29), IRQD_IRQ_ENABLED_ON_SUSPEND = BIT(30), + IRQD_RESEND_WHEN_IN_PROGRESS = BIT(31), }; #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) @@ -448,6 +451,16 @@ static inline bool irqd_affinity_on_activate(struct irq_data *d) return __irqd_to_state(d) & IRQD_AFFINITY_ON_ACTIVATE; } +static inline void irqd_set_resend_when_in_progress(struct irq_data *d) +{ + __irqd_to_state(d) |= IRQD_RESEND_WHEN_IN_PROGRESS; +} + +static inline bool irqd_needs_resend_when_in_progress(struct irq_data *d) +{ + return __irqd_to_state(d) & IRQD_RESEND_WHEN_IN_PROGRESS; +} + #undef __irqd_to_state static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 49e7bc871fec..57cd8f475302 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -692,8 +692,16 @@ void handle_fasteoi_irq(struct irq_desc *desc) raw_spin_lock(&desc->lock); - if (!irq_may_run(desc)) + /* + * When an affinity change races with IRQ handling, the next interrupt + * can arrive on the new CPU before the original CPU has completed + * handling the previous one - it may need to be resent. + */ + if (!irq_may_run(desc)) { + if (irqd_needs_resend_when_in_progress(&desc->irq_data)) + desc->istate |= IRQS_PENDING; goto out; + } desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); @@ -715,6 +723,12 @@ void handle_fasteoi_irq(struct irq_desc *desc) cond_unmask_eoi_irq(desc, chip); + /* + * When the race described above happens this will resend the interrupt. + */ + if (unlikely(desc->istate & IRQS_PENDING)) + check_irq_resend(desc, false); + raw_spin_unlock(&desc->lock); return; out: diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index bbcaac64038e..5971a66be034 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -133,6 +133,8 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX), BIT_MASK_DESCR(IRQD_IRQ_ENABLED_ON_SUSPEND), + + BIT_MASK_DESCR(IRQD_RESEND_WHEN_IN_PROGRESS), }; static const struct irq_bit_descr irqdesc_states[] = { -- cgit From f1771b85e3086c9506c3de81e993330bca568ba5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 May 2023 22:05:05 +0200 Subject: irqchip/mmp: Remove non-DT codepath Building with "W=1" warns about missing declarations for two functions in the mmp irqchip driver: drivers/irqchip/irq-mmp.c:248:13: error: no previous prototype for 'icu_init_irq' drivers/irqchip/irq-mmp.c:271:13: error: no previous prototype for 'mmp2_init_icu' The declarations are present in an unused header, but since there is no caller, it's best to just remove the functions and the header completely, making the driver DT-only to match the state of the platform. Fixes: 77acc85ce797 ("ARM: mmp: remove device definitions") Signed-off-by: Arnd Bergmann Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230516200516.554663-2-arnd@kernel.org --- drivers/irqchip/irq-mmp.c | 127 -------------------------------------------- include/linux/irqchip/mmp.h | 10 ---- 2 files changed, 137 deletions(-) delete mode 100644 include/linux/irqchip/mmp.h (limited to 'include/linux') diff --git a/drivers/irqchip/irq-mmp.c b/drivers/irqchip/irq-mmp.c index 83455ca72439..25cf4f80e767 100644 --- a/drivers/irqchip/irq-mmp.c +++ b/drivers/irqchip/irq-mmp.c @@ -244,132 +244,6 @@ static void __exception_irq_entry mmp2_handle_irq(struct pt_regs *regs) generic_handle_domain_irq(icu_data[0].domain, hwirq); } -/* MMP (ARMv5) */ -void __init icu_init_irq(void) -{ - int irq; - - max_icu_nr = 1; - mmp_icu_base = ioremap(0xd4282000, 0x1000); - icu_data[0].conf_enable = mmp_conf.conf_enable; - icu_data[0].conf_disable = mmp_conf.conf_disable; - icu_data[0].conf_mask = mmp_conf.conf_mask; - icu_data[0].nr_irqs = 64; - icu_data[0].virq_base = 0; - icu_data[0].domain = irq_domain_add_legacy(NULL, 64, 0, 0, - &irq_domain_simple_ops, - &icu_data[0]); - for (irq = 0; irq < 64; irq++) { - icu_mask_irq(irq_get_irq_data(irq)); - irq_set_chip_and_handler(irq, &icu_irq_chip, handle_level_irq); - } - irq_set_default_host(icu_data[0].domain); - set_handle_irq(mmp_handle_irq); -} - -/* MMP2 (ARMv7) */ -void __init mmp2_init_icu(void) -{ - int irq, end; - - max_icu_nr = 8; - mmp_icu_base = ioremap(0xd4282000, 0x1000); - icu_data[0].conf_enable = mmp2_conf.conf_enable; - icu_data[0].conf_disable = mmp2_conf.conf_disable; - icu_data[0].conf_mask = mmp2_conf.conf_mask; - icu_data[0].nr_irqs = 64; - icu_data[0].virq_base = 0; - icu_data[0].domain = irq_domain_add_legacy(NULL, 64, 0, 0, - &irq_domain_simple_ops, - &icu_data[0]); - icu_data[1].reg_status = mmp_icu_base + 0x150; - icu_data[1].reg_mask = mmp_icu_base + 0x168; - icu_data[1].clr_mfp_irq_base = icu_data[0].virq_base + - icu_data[0].nr_irqs; - icu_data[1].clr_mfp_hwirq = 1; /* offset to IRQ_MMP2_PMIC_BASE */ - icu_data[1].nr_irqs = 2; - icu_data[1].cascade_irq = 4; - icu_data[1].virq_base = icu_data[0].virq_base + icu_data[0].nr_irqs; - icu_data[1].domain = irq_domain_add_legacy(NULL, icu_data[1].nr_irqs, - icu_data[1].virq_base, 0, - &irq_domain_simple_ops, - &icu_data[1]); - icu_data[2].reg_status = mmp_icu_base + 0x154; - icu_data[2].reg_mask = mmp_icu_base + 0x16c; - icu_data[2].nr_irqs = 2; - icu_data[2].cascade_irq = 5; - icu_data[2].virq_base = icu_data[1].virq_base + icu_data[1].nr_irqs; - icu_data[2].domain = irq_domain_add_legacy(NULL, icu_data[2].nr_irqs, - icu_data[2].virq_base, 0, - &irq_domain_simple_ops, - &icu_data[2]); - icu_data[3].reg_status = mmp_icu_base + 0x180; - icu_data[3].reg_mask = mmp_icu_base + 0x17c; - icu_data[3].nr_irqs = 3; - icu_data[3].cascade_irq = 9; - icu_data[3].virq_base = icu_data[2].virq_base + icu_data[2].nr_irqs; - icu_data[3].domain = irq_domain_add_legacy(NULL, icu_data[3].nr_irqs, - icu_data[3].virq_base, 0, - &irq_domain_simple_ops, - &icu_data[3]); - icu_data[4].reg_status = mmp_icu_base + 0x158; - icu_data[4].reg_mask = mmp_icu_base + 0x170; - icu_data[4].nr_irqs = 5; - icu_data[4].cascade_irq = 17; - icu_data[4].virq_base = icu_data[3].virq_base + icu_data[3].nr_irqs; - icu_data[4].domain = irq_domain_add_legacy(NULL, icu_data[4].nr_irqs, - icu_data[4].virq_base, 0, - &irq_domain_simple_ops, - &icu_data[4]); - icu_data[5].reg_status = mmp_icu_base + 0x15c; - icu_data[5].reg_mask = mmp_icu_base + 0x174; - icu_data[5].nr_irqs = 15; - icu_data[5].cascade_irq = 35; - icu_data[5].virq_base = icu_data[4].virq_base + icu_data[4].nr_irqs; - icu_data[5].domain = irq_domain_add_legacy(NULL, icu_data[5].nr_irqs, - icu_data[5].virq_base, 0, - &irq_domain_simple_ops, - &icu_data[5]); - icu_data[6].reg_status = mmp_icu_base + 0x160; - icu_data[6].reg_mask = mmp_icu_base + 0x178; - icu_data[6].nr_irqs = 2; - icu_data[6].cascade_irq = 51; - icu_data[6].virq_base = icu_data[5].virq_base + icu_data[5].nr_irqs; - icu_data[6].domain = irq_domain_add_legacy(NULL, icu_data[6].nr_irqs, - icu_data[6].virq_base, 0, - &irq_domain_simple_ops, - &icu_data[6]); - icu_data[7].reg_status = mmp_icu_base + 0x188; - icu_data[7].reg_mask = mmp_icu_base + 0x184; - icu_data[7].nr_irqs = 2; - icu_data[7].cascade_irq = 55; - icu_data[7].virq_base = icu_data[6].virq_base + icu_data[6].nr_irqs; - icu_data[7].domain = irq_domain_add_legacy(NULL, icu_data[7].nr_irqs, - icu_data[7].virq_base, 0, - &irq_domain_simple_ops, - &icu_data[7]); - end = icu_data[7].virq_base + icu_data[7].nr_irqs; - for (irq = 0; irq < end; irq++) { - icu_mask_irq(irq_get_irq_data(irq)); - if (irq == icu_data[1].cascade_irq || - irq == icu_data[2].cascade_irq || - irq == icu_data[3].cascade_irq || - irq == icu_data[4].cascade_irq || - irq == icu_data[5].cascade_irq || - irq == icu_data[6].cascade_irq || - irq == icu_data[7].cascade_irq) { - irq_set_chip(irq, &icu_irq_chip); - irq_set_chained_handler(irq, icu_mux_irq_demux); - } else { - irq_set_chip_and_handler(irq, &icu_irq_chip, - handle_level_irq); - } - } - irq_set_default_host(icu_data[0].domain); - set_handle_irq(mmp2_handle_irq); -} - -#ifdef CONFIG_OF static int __init mmp_init_bases(struct device_node *node) { int ret, nr_irqs, irq, i = 0; @@ -548,4 +422,3 @@ err: return -EINVAL; } IRQCHIP_DECLARE(mmp2_mux_intc, "mrvl,mmp2-mux-intc", mmp2_mux_of_init); -#endif diff --git a/include/linux/irqchip/mmp.h b/include/linux/irqchip/mmp.h deleted file mode 100644 index aa1813749a4f..000000000000 --- a/include/linux/irqchip/mmp.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __IRQCHIP_MMP_H -#define __IRQCHIP_MMP_H - -extern struct irq_chip icu_irq_chip; - -extern void icu_init_irq(void); -extern void mmp2_init_icu(void); - -#endif /* __IRQCHIP_MMP_H */ -- cgit From 415e84294798d1cb041c902168393054cc4ad211 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 16 May 2023 22:05:08 +0200 Subject: irqchip/gicv3: Add a iort_pmsi_get_dev_id() prototype iort_pmsi_get_dev_id() has a __weak definition in the driver, and an override in arm64 specific code, but the declaration is conditional and not always seen when the copy in the driver gets built: drivers/irqchip/irq-gic-v3-its-platform-msi.c:41:12: error: no previous prototype for 'iort_pmsi_get_dev_id' [-Werror=missing-prototypes] Move the existing declaration out of the #ifdef block to ensure it can be seen in all configurations. Signed-off-by: Arnd Bergmann Reviewed-by: Hanjun Guo Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230516200516.554663-5-arnd@kernel.org --- include/linux/acpi_iort.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index b43be0987b19..6b70d02bc5f9 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -26,13 +26,14 @@ int iort_register_domain_token(int trans_id, phys_addr_t base, struct fwnode_handle *fw_node); void iort_deregister_domain_token(int trans_id); struct fwnode_handle *iort_find_domain_token(int trans_id); +int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id); + #ifdef CONFIG_ACPI_IORT void acpi_iort_init(void); u32 iort_msi_map_id(struct device *dev, u32 id); struct irq_domain *iort_get_device_domain(struct device *dev, u32 id, enum irq_domain_bus_token bus_token); void acpi_configure_pmsi_domain(struct device *dev); -int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id); void iort_get_rmr_sids(struct fwnode_handle *iommu_fwnode, struct list_head *head); void iort_put_rmr_sids(struct fwnode_handle *iommu_fwnode, -- cgit