From 07fd5b6cdf3cc30bfde8fe0f644771688be04447 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 13 Jun 2022 12:19:50 -1000 Subject: cgroup: Use separate src/dst nodes when preloading css_sets for migration Each cset (css_set) is pinned by its tasks. When we're moving tasks around across csets for a migration, we need to hold the source and destination csets to ensure that they don't go away while we're moving tasks about. This is done by linking cset->mg_preload_node on either the mgctx->preloaded_src_csets or mgctx->preloaded_dst_csets list. Using the same cset->mg_preload_node for both the src and dst lists was deemed okay as a cset can't be both the source and destination at the same time. Unfortunately, this overloading becomes problematic when multiple tasks are involved in a migration and some of them are identity noop migrations while others are actually moving across cgroups. For example, this can happen with the following sequence on cgroup1: #1> mkdir -p /sys/fs/cgroup/misc/a/b #2> echo $$ > /sys/fs/cgroup/misc/a/cgroup.procs #3> RUN_A_COMMAND_WHICH_CREATES_MULTIPLE_THREADS & #4> PID=$! #5> echo $PID > /sys/fs/cgroup/misc/a/b/tasks #6> echo $PID > /sys/fs/cgroup/misc/a/cgroup.procs the process including the group leader back into a. In this final migration, non-leader threads would be doing identity migration while the group leader is doing an actual one. After #3, let's say the whole process was in cset A, and that after #4, the leader moves to cset B. Then, during #6, the following happens: 1. cgroup_migrate_add_src() is called on B for the leader. 2. cgroup_migrate_add_src() is called on A for the other threads. 3. cgroup_migrate_prepare_dst() is called. It scans the src list. 4. It notices that B wants to migrate to A, so it tries to A to the dst list but realizes that its ->mg_preload_node is already busy. 5. and then it notices A wants to migrate to A as it's an identity migration, it culls it by list_del_init()'ing its ->mg_preload_node and putting references accordingly. 6. The rest of migration takes place with B on the src list but nothing on the dst list. This means that A isn't held while migration is in progress. If all tasks leave A before the migration finishes and the incoming task pins it, the cset will be destroyed leading to use-after-free. This is caused by overloading cset->mg_preload_node for both src and dst preload lists. We wanted to exclude the cset from the src list but ended up inadvertently excluding it from the dst list too. This patch fixes the issue by separating out cset->mg_preload_node into ->mg_src_preload_node and ->mg_dst_preload_node, so that the src and dst preloadings don't interfere with each other. Signed-off-by: Tejun Heo Reported-by: Mukesh Ojha Reported-by: shisiyuan Link: http://lkml.kernel.org/r/1654187688-27411-1-git-send-email-shisiyuan@xiaomi.com Link: https://www.spinics.net/lists/cgroups/msg33313.html Fixes: f817de98513d ("cgroup: prepare migration path for unified hierarchy") Cc: stable@vger.kernel.org # v3.16+ --- include/linux/cgroup-defs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1bfcfb1af352..d4427d0a0e18 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -264,7 +264,8 @@ struct css_set { * List of csets participating in the on-going migration either as * source or destination. Protected by cgroup_mutex. */ - struct list_head mg_preload_node; + struct list_head mg_src_preload_node; + struct list_head mg_dst_preload_node; struct list_head mg_node; /* -- cgit From f50974eee5c4a5de1e4f1a3d873099f170df25f8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Thu, 23 Jun 2022 13:02:31 -0700 Subject: memregion: Fix memregion_free() fallback definition In the CONFIG_MEMREGION=n case, memregion_free() is meant to be a static inline. 0day reports: In file included from drivers/cxl/core/port.c:4: include/linux/memregion.h:19:6: warning: no previous prototype for function 'memregion_free' [-Wmissing-prototypes] Mark memregion_free() static. Fixes: 33dd70752cd7 ("lib: Uplevel the pmem "region" ida to a global allocator") Reported-by: kernel test robot Reviewed-by: Alison Schofield Link: https://lore.kernel.org/r/165601455171.4042645.3350844271068713515.stgit@dwillia2-xfh Signed-off-by: Dan Williams --- include/linux/memregion.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memregion.h b/include/linux/memregion.h index e11595256cac..c04c4fd2e209 100644 --- a/include/linux/memregion.h +++ b/include/linux/memregion.h @@ -16,7 +16,7 @@ static inline int memregion_alloc(gfp_t gfp) { return -ENOMEM; } -void memregion_free(int id) +static inline void memregion_free(int id) { } #endif -- cgit From c346dae4f3fbce51bbd4f2ec5e8c6f9b91e93163 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 22 Jun 2022 09:29:40 +0800 Subject: virtio: disable notification hardening by default We try to harden virtio device notifications in 8b4ec69d7e09 ("virtio: harden vring IRQ"). It works with the assumption that the driver or core can properly call virtio_device_ready() at the right place. Unfortunately, this seems to be not true and uncover various bugs of the existing drivers, mainly the issue of using virtio_device_ready() incorrectly. So let's add a Kconfig option and disable it by default. It gives us time to fix the drivers and then we can consider re-enabling it. Signed-off-by: Jason Wang Message-Id: <20220622012940.21441-1-jasowang@redhat.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Cornelia Huck --- drivers/s390/virtio/virtio_ccw.c | 9 ++++++++- drivers/virtio/Kconfig | 13 +++++++++++++ drivers/virtio/virtio.c | 2 ++ drivers/virtio/virtio_ring.c | 12 ++++++++++++ include/linux/virtio_config.h | 2 ++ 5 files changed, 37 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c index 97e51c34e6cf..161d3b141f0d 100644 --- a/drivers/s390/virtio/virtio_ccw.c +++ b/drivers/s390/virtio/virtio_ccw.c @@ -1136,8 +1136,13 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev, vcdev->err = -EIO; } virtio_ccw_check_activity(vcdev, activity); - /* Interrupts are disabled here */ +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION + /* + * Paired with virtio_ccw_synchronize_cbs() and interrupts are + * disabled here. + */ read_lock(&vcdev->irq_lock); +#endif for_each_set_bit(i, indicators(vcdev), sizeof(*indicators(vcdev)) * BITS_PER_BYTE) { /* The bit clear must happen before the vring kick. */ @@ -1146,7 +1151,9 @@ static void virtio_ccw_int_handler(struct ccw_device *cdev, vq = virtio_ccw_vq_by_ind(vcdev, i); vring_interrupt(0, vq); } +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION read_unlock(&vcdev->irq_lock); +#endif if (test_bit(0, indicators2(vcdev))) { virtio_config_changed(&vcdev->vdev); clear_bit(0, indicators2(vcdev)); diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index a6dc8b5846fe..e1556d2a355a 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -29,6 +29,19 @@ menuconfig VIRTIO_MENU if VIRTIO_MENU +config VIRTIO_HARDEN_NOTIFICATION + bool "Harden virtio notification" + help + Enable this to harden the device notifications and suppress + those that happen at a time where notifications are illegal. + + Experimental: Note that several drivers still have bugs that + may cause crashes or hangs when correct handling of + notifications is enforced; depending on the subset of + drivers and devices you use, this may or may not work. + + If unsure, say N. + config VIRTIO_PCI tristate "PCI driver for virtio devices" depends on PCI diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 6bace84ae37e..7deeed30d1f3 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -219,6 +219,7 @@ static int virtio_features_ok(struct virtio_device *dev) * */ void virtio_reset_device(struct virtio_device *dev) { +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION /* * The below virtio_synchronize_cbs() guarantees that any * interrupt for this line arriving after @@ -227,6 +228,7 @@ void virtio_reset_device(struct virtio_device *dev) */ virtio_break_device(dev); virtio_synchronize_cbs(dev); +#endif dev->config->reset(dev); } diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index ef30465dd39c..96731cdef422 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -1708,7 +1708,11 @@ static struct virtqueue *vring_create_virtqueue_packed( vq->we_own_ring = true; vq->notify = notify; vq->weak_barriers = weak_barriers; +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION vq->broken = true; +#else + vq->broken = false; +#endif vq->last_used_idx = 0 | (1 << VRING_PACKED_EVENT_F_WRAP_CTR); vq->event_triggered = false; vq->num_added = 0; @@ -2154,9 +2158,13 @@ irqreturn_t vring_interrupt(int irq, void *_vq) } if (unlikely(vq->broken)) { +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION dev_warn_once(&vq->vq.vdev->dev, "virtio vring IRQ raised before DRIVER_OK"); return IRQ_NONE; +#else + return IRQ_HANDLED; +#endif } /* Just a hint for performance: so it's ok that this can be racy! */ @@ -2199,7 +2207,11 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, vq->we_own_ring = false; vq->notify = notify; vq->weak_barriers = weak_barriers; +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION vq->broken = true; +#else + vq->broken = false; +#endif vq->last_used_idx = 0; vq->event_triggered = false; vq->num_added = 0; diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 49c7c32815f1..b47c2e7ed0ee 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -257,6 +257,7 @@ void virtio_device_ready(struct virtio_device *dev) WARN_ON(status & VIRTIO_CONFIG_S_DRIVER_OK); +#ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION /* * The virtio_synchronize_cbs() makes sure vring_interrupt() * will see the driver specific setup if it sees vq->broken @@ -264,6 +265,7 @@ void virtio_device_ready(struct virtio_device *dev) */ virtio_synchronize_cbs(dev); __virtio_unbreak_device(dev); +#endif /* * The transport should ensure the visibility of vq->broken * before setting DRIVER_OK. See the comments for the transport -- cgit From 3b89b511ea0c705cc418440e2abf9d692a556d84 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 23 Jun 2022 16:32:32 +0300 Subject: net: fix IFF_TX_SKB_NO_LINEAR definition The "1<<31" shift has a sign extension bug so IFF_TX_SKB_NO_LINEAR is 0xffffffff80000000 instead of 0x0000000080000000. Fixes: c2ff53d8049f ("net: Add priv_flags for allow tx skb without linear") Signed-off-by: Dan Carpenter Reviewed-by: Xuan Zhuo Link: https://lore.kernel.org/r/YrRrcGttfEVnf85Q@kili Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f615a66c89e9..2563d30736e9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1671,7 +1671,7 @@ enum netdev_priv_flags { IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, IFF_LIVE_RENAME_OK = 1<<30, - IFF_TX_SKB_NO_LINEAR = 1<<31, + IFF_TX_SKB_NO_LINEAR = BIT_ULL(31), IFF_CHANGE_PROTO_DOWN = BIT_ULL(32), }; -- cgit From 742ab6df974ae8384a2dd213db1a3a06cf6d8936 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 14 Jun 2022 23:15:32 +0200 Subject: x86/kvm/vmx: Make noinstr clean The recent mmio_stale_data fixes broke the noinstr constraints: vmlinux.o: warning: objtool: vmx_vcpu_enter_exit+0x15b: call to wrmsrl.constprop.0() leaves .noinstr.text section vmlinux.o: warning: objtool: vmx_vcpu_enter_exit+0x1bf: call to kvm_arch_has_assigned_device() leaves .noinstr.text section make it all happy again. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov --- arch/x86/kvm/vmx/vmx.c | 6 +++--- arch/x86/kvm/x86.c | 4 ++-- include/linux/kvm_host.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 3a919e49129b..009bbae9ad66 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -383,9 +383,9 @@ static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx) if (!vmx->disable_fb_clear) return; - rdmsrl(MSR_IA32_MCU_OPT_CTRL, msr); + msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL); msr |= FB_CLEAR_DIS; - wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); + native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr); /* Cache the MSR value to avoid reading it later */ vmx->msr_ia32_mcu_opt_ctrl = msr; } @@ -396,7 +396,7 @@ static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx) return; vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS; - wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); + native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl); } static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1910e1e78b15..26d0cac32f73 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12631,9 +12631,9 @@ void kvm_arch_end_assignment(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_end_assignment); -bool kvm_arch_has_assigned_device(struct kvm *kvm) +bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm) { - return atomic_read(&kvm->arch.assigned_device_count); + return arch_atomic_read(&kvm->arch.assigned_device_count); } EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c20f2d55840c..83cf7fd842e0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1513,7 +1513,7 @@ static inline void kvm_arch_end_assignment(struct kvm *kvm) { } -static inline bool kvm_arch_has_assigned_device(struct kvm *kvm) +static __always_inline bool kvm_arch_has_assigned_device(struct kvm *kvm) { return false; } -- cgit From 6b80b59b3555706508008f1f127b5412c89c7fd8 Mon Sep 17 00:00:00 2001 From: Alexandre Chartre Date: Tue, 14 Jun 2022 23:15:49 +0200 Subject: x86/bugs: Report AMD retbleed vulnerability Report that AMD x86 CPUs are vulnerable to the RETBleed (Arbitrary Speculative Code Execution with Return Instructions) attack. [peterz: add hygon] [kim: invert parity; fam15h] Co-developed-by: Kim Phillips Signed-off-by: Kim Phillips Signed-off-by: Alexandre Chartre Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Josh Poimboeuf Signed-off-by: Borislav Petkov --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/bugs.c | 13 +++++++++++++ arch/x86/kernel/cpu/common.c | 19 +++++++++++++++++++ drivers/base/cpu.c | 8 ++++++++ include/linux/cpu.h | 2 ++ 5 files changed, 43 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index fa3d0db1470e..c16503ca3b75 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -453,5 +453,6 @@ #define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ #define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ #define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ +#define X86_BUG_RETBLEED X86_BUG(26) /* CPU is affected by RETBleed */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 74c62cc47a5f..74fdd21e416b 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1994,6 +1994,11 @@ static ssize_t srbds_show_state(char *buf) return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); } +static ssize_t retbleed_show_state(char *buf) +{ + return sprintf(buf, "Vulnerable\n"); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -2039,6 +2044,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_MMIO_STALE_DATA: return mmio_stale_data_show_state(buf); + case X86_BUG_RETBLEED: + return retbleed_show_state(buf); + default: break; } @@ -2095,4 +2103,9 @@ ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *at { return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); } + +ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4730b0a58f24..4089c173c6ae 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1205,16 +1205,27 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { {} }; +#define VULNBL(vendor, family, model, blacklist) \ + X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, blacklist) + #define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ INTEL_FAM6_##model, steppings, \ X86_FEATURE_ANY, issues) +#define VULNBL_AMD(family, blacklist) \ + VULNBL(AMD, family, X86_MODEL_ANY, blacklist) + +#define VULNBL_HYGON(family, blacklist) \ + VULNBL(HYGON, family, X86_MODEL_ANY, blacklist) + #define SRBDS BIT(0) /* CPU is affected by X86_BUG_MMIO_STALE_DATA */ #define MMIO BIT(1) /* CPU is affected by Shared Buffers Data Sampling (SBDS), a variant of X86_BUG_MMIO_STALE_DATA */ #define MMIO_SBDS BIT(2) +/* CPU is affected by RETbleed, speculating where you would not expect it */ +#define RETBLEED BIT(3) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), @@ -1247,6 +1258,11 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(ATOM_TREMONT, X86_STEPPINGS(0x1, 0x1), MMIO | MMIO_SBDS), VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_D, X86_STEPPING_ANY, MMIO), VULNBL_INTEL_STEPPINGS(ATOM_TREMONT_L, X86_STEPPINGS(0x0, 0x0), MMIO | MMIO_SBDS), + + VULNBL_AMD(0x15, RETBLEED), + VULNBL_AMD(0x16, RETBLEED), + VULNBL_AMD(0x17, RETBLEED), + VULNBL_HYGON(0x18, RETBLEED), {} }; @@ -1348,6 +1364,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) !arch_cap_mmio_immune(ia32_cap)) setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); + if (cpu_matches(cpu_vuln_blacklist, RETBLEED)) + setup_force_cpu_bug(X86_BUG_RETBLEED); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index a97776ea9d99..4c98849577d4 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -570,6 +570,12 @@ ssize_t __weak cpu_show_mmio_stale_data(struct device *dev, return sysfs_emit(buf, "Not affected\n"); } +ssize_t __weak cpu_show_retbleed(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "Not affected\n"); +} + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); @@ -580,6 +586,7 @@ static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL); static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL); static DEVICE_ATTR(srbds, 0444, cpu_show_srbds, NULL); static DEVICE_ATTR(mmio_stale_data, 0444, cpu_show_mmio_stale_data, NULL); +static DEVICE_ATTR(retbleed, 0444, cpu_show_retbleed, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -592,6 +599,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_itlb_multihit.attr, &dev_attr_srbds.attr, &dev_attr_mmio_stale_data.attr, + &dev_attr_retbleed.attr, NULL }; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 2c7477354744..314802f98b9d 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -68,6 +68,8 @@ extern ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, extern ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_retbleed(struct device *dev, + struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, -- cgit From a09a6e2399ba0595c3042b3164f3ca68a3cff33e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 14 Jun 2022 23:16:03 +0200 Subject: objtool: Add entry UNRET validation Since entry asm is tricky, add a validation pass that ensures the retbleed mitigation has been done before the first actual RET instruction. Entry points are those that either have UNWIND_HINT_ENTRY, which acts as UNWIND_HINT_EMPTY but marks the instruction as an entry point, or those that have UWIND_HINT_IRET_REGS at +0. This is basically a variant of validate_branch() that is intra-function and it will simply follow all branches from marked entry points and ensures that all paths lead to ANNOTATE_UNRET_END. If a path hits RET or an indirection the path is a fail and will be reported. There are 3 ANNOTATE_UNRET_END instances: - UNTRAIN_RET itself - exception from-kernel; this path doesn't need UNTRAIN_RET - all early exceptions; these also don't need UNTRAIN_RET Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Josh Poimboeuf Signed-off-by: Borislav Petkov --- arch/x86/entry/entry_64.S | 3 +- arch/x86/entry/entry_64_compat.S | 6 +- arch/x86/include/asm/nospec-branch.h | 12 +++ arch/x86/include/asm/unwind_hints.h | 4 + arch/x86/kernel/head_64.S | 5 + arch/x86/xen/xen-asm.S | 10 +- include/linux/objtool.h | 3 + scripts/Makefile.vmlinux_o | 2 +- tools/include/linux/objtool.h | 3 + tools/objtool/builtin-check.c | 6 ++ tools/objtool/check.c | 177 ++++++++++++++++++++++++++++++-- tools/objtool/include/objtool/builtin.h | 1 + tools/objtool/include/objtool/check.h | 11 +- 13 files changed, 222 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 0c88802e1155..65e3b8b7cbe5 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -85,7 +85,7 @@ */ SYM_CODE_START(entry_SYSCALL_64) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR swapgs @@ -1095,6 +1095,7 @@ SYM_CODE_START_LOCAL(error_entry) .Lerror_entry_done_lfence: FENCE_SWAPGS_KERNEL_ENTRY leaq 8(%rsp), %rax /* return pt_regs pointer */ + ANNOTATE_UNRET_END RET .Lbstep_iret: diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index bcb89d23ac0e..682338e7e2a3 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -49,7 +49,7 @@ * 0(%ebp) arg6 */ SYM_CODE_START(entry_SYSENTER_compat) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR /* Interrupts are off on entry. */ swapgs @@ -179,7 +179,7 @@ SYM_CODE_END(entry_SYSENTER_compat) * 0(%esp) arg6 */ SYM_CODE_START(entry_SYSCALL_compat) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR /* Interrupts are off on entry. */ swapgs @@ -305,7 +305,7 @@ SYM_CODE_END(entry_SYSCALL_compat) * ebp arg6 */ SYM_CODE_START(entry_INT80_compat) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR /* * Interrupts are off on entry. diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 05dd75478d7b..bba42bd78edf 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -81,6 +81,17 @@ */ #define ANNOTATE_UNRET_SAFE ANNOTATE_RETPOLINE_SAFE +/* + * Abuse ANNOTATE_RETPOLINE_SAFE on a NOP to indicate UNRET_END, should + * eventually turn into it's own annotation. + */ +.macro ANNOTATE_UNRET_END +#ifdef CONFIG_DEBUG_ENTRY + ANNOTATE_RETPOLINE_SAFE + nop +#endif +.endm + /* * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple * indirect jmp/call which may be susceptible to the Spectre variant 2 @@ -131,6 +142,7 @@ */ .macro UNTRAIN_RET #ifdef CONFIG_RETPOLINE + ANNOTATE_UNRET_END ALTERNATIVE_2 "", \ "call zen_untrain_ret", X86_FEATURE_UNRET, \ "call entry_ibpb", X86_FEATURE_ENTRY_IBPB diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index 8b33674288ea..6f70fe4c93f2 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -11,6 +11,10 @@ UNWIND_HINT sp_reg=ORC_REG_UNDEFINED type=UNWIND_HINT_TYPE_CALL end=1 .endm +.macro UNWIND_HINT_ENTRY + UNWIND_HINT sp_reg=ORC_REG_UNDEFINED type=UNWIND_HINT_TYPE_ENTRY end=1 +.endm + .macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0 .if \base == %rsp .if \indirect diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 92c4afa2b729..d860d437631b 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -389,6 +389,8 @@ SYM_CODE_START_NOALIGN(vc_boot_ghcb) UNWIND_HINT_IRET_REGS offset=8 ENDBR + ANNOTATE_UNRET_END + /* Build pt_regs */ PUSH_AND_CLEAR_REGS @@ -448,6 +450,7 @@ SYM_CODE_END(early_idt_handler_array) SYM_CODE_START_LOCAL(early_idt_handler_common) UNWIND_HINT_IRET_REGS offset=16 + ANNOTATE_UNRET_END /* * The stack is the hardware frame, an error code or zero, and the * vector number. @@ -497,6 +500,8 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb) UNWIND_HINT_IRET_REGS offset=8 ENDBR + ANNOTATE_UNRET_END + /* Build pt_regs */ PUSH_AND_CLEAR_REGS diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 6bf9d45b9178..6b4fdf6b9542 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -121,7 +121,7 @@ SYM_FUNC_END(xen_read_cr2_direct); .macro xen_pv_trap name SYM_CODE_START(xen_\name) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR pop %rcx pop %r11 @@ -235,7 +235,7 @@ SYM_CODE_END(xenpv_restore_regs_and_return_to_usermode) /* Normal 64-bit system call target */ SYM_CODE_START(xen_entry_SYSCALL_64) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR popq %rcx popq %r11 @@ -255,7 +255,7 @@ SYM_CODE_END(xen_entry_SYSCALL_64) /* 32-bit compat syscall target */ SYM_CODE_START(xen_entry_SYSCALL_compat) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR popq %rcx popq %r11 @@ -273,7 +273,7 @@ SYM_CODE_END(xen_entry_SYSCALL_compat) /* 32-bit compat sysenter target */ SYM_CODE_START(xen_entry_SYSENTER_compat) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR /* * NB: Xen is polite and clears TF from EFLAGS for us. This means @@ -297,7 +297,7 @@ SYM_CODE_END(xen_entry_SYSENTER_compat) SYM_CODE_START(xen_entry_SYSCALL_compat) SYM_CODE_START(xen_entry_SYSENTER_compat) - UNWIND_HINT_EMPTY + UNWIND_HINT_ENTRY ENDBR lea 16(%rsp), %rsp /* strip %rcx, %r11 */ mov $-ENOSYS, %rax diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 15b940ec1eac..b026f1ae39c6 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -32,11 +32,14 @@ struct unwind_hint { * * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function. * Useful for code which doesn't have an ELF function annotation. + * + * UNWIND_HINT_ENTRY: machine entry without stack, SYSCALL/SYSENTER etc. */ #define UNWIND_HINT_TYPE_CALL 0 #define UNWIND_HINT_TYPE_REGS 1 #define UNWIND_HINT_TYPE_REGS_PARTIAL 2 #define UNWIND_HINT_TYPE_FUNC 3 +#define UNWIND_HINT_TYPE_ENTRY 4 #ifdef CONFIG_OBJTOOL diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o index 3c97a1564947..bc67748044a6 100644 --- a/scripts/Makefile.vmlinux_o +++ b/scripts/Makefile.vmlinux_o @@ -44,7 +44,7 @@ objtool-enabled := $(or $(delay-objtool),$(CONFIG_NOINSTR_VALIDATION)) objtool_args := \ $(if $(delay-objtool),$(objtool_args)) \ - $(if $(CONFIG_NOINSTR_VALIDATION), --noinstr) \ + $(if $(CONFIG_NOINSTR_VALIDATION), --noinstr $(if $(CONFIG_RETPOLINE), --unret)) \ $(if $(CONFIG_GCOV_KERNEL), --no-unreachable) \ --link diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h index 15b940ec1eac..b026f1ae39c6 100644 --- a/tools/include/linux/objtool.h +++ b/tools/include/linux/objtool.h @@ -32,11 +32,14 @@ struct unwind_hint { * * UNWIND_HINT_FUNC: Generate the unwind metadata of a callable function. * Useful for code which doesn't have an ELF function annotation. + * + * UNWIND_HINT_ENTRY: machine entry without stack, SYSCALL/SYSENTER etc. */ #define UNWIND_HINT_TYPE_CALL 0 #define UNWIND_HINT_TYPE_REGS 1 #define UNWIND_HINT_TYPE_REGS_PARTIAL 2 #define UNWIND_HINT_TYPE_FUNC 3 +#define UNWIND_HINT_TYPE_ENTRY 4 #ifdef CONFIG_OBJTOOL diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index f4c3a5091737..c063e1ff96b2 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -68,6 +68,7 @@ const struct option check_options[] = { OPT_BOOLEAN('n', "noinstr", &opts.noinstr, "validate noinstr rules"), OPT_BOOLEAN('o', "orc", &opts.orc, "generate ORC metadata"), OPT_BOOLEAN('r', "retpoline", &opts.retpoline, "validate and annotate retpoline usage"), + OPT_BOOLEAN(0, "unret", &opts.unret, "validate entry unret placement"), OPT_BOOLEAN('l', "sls", &opts.sls, "validate straight-line-speculation mitigations"), OPT_BOOLEAN('s', "stackval", &opts.stackval, "validate frame pointer rules"), OPT_BOOLEAN('t', "static-call", &opts.static_call, "annotate static calls"), @@ -163,6 +164,11 @@ static bool link_opts_valid(struct objtool_file *file) return false; } + if (opts.unret) { + ERROR("--unret requires --link"); + return false; + } + return true; } diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 7dc378156a63..822a490e6d87 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2032,16 +2032,24 @@ static int read_unwind_hints(struct objtool_file *file) insn->hint = true; - if (opts.ibt && hint->type == UNWIND_HINT_TYPE_REGS_PARTIAL) { + if (hint->type == UNWIND_HINT_TYPE_REGS_PARTIAL) { struct symbol *sym = find_symbol_by_offset(insn->sec, insn->offset); - if (sym && sym->bind == STB_GLOBAL && - insn->type != INSN_ENDBR && !insn->noendbr) { - WARN_FUNC("UNWIND_HINT_IRET_REGS without ENDBR", - insn->sec, insn->offset); + if (sym && sym->bind == STB_GLOBAL) { + if (opts.ibt && insn->type != INSN_ENDBR && !insn->noendbr) { + WARN_FUNC("UNWIND_HINT_IRET_REGS without ENDBR", + insn->sec, insn->offset); + } + + insn->entry = 1; } } + if (hint->type == UNWIND_HINT_TYPE_ENTRY) { + hint->type = UNWIND_HINT_TYPE_CALL; + insn->entry = 1; + } + if (hint->type == UNWIND_HINT_TYPE_FUNC) { insn->cfi = &func_cfi; continue; @@ -2116,8 +2124,9 @@ static int read_retpoline_hints(struct objtool_file *file) if (insn->type != INSN_JUMP_DYNAMIC && insn->type != INSN_CALL_DYNAMIC && - insn->type != INSN_RETURN) { - WARN_FUNC("retpoline_safe hint not an indirect jump/call/ret", + insn->type != INSN_RETURN && + insn->type != INSN_NOP) { + WARN_FUNC("retpoline_safe hint not an indirect jump/call/ret/nop", insn->sec, insn->offset); return -1; } @@ -3305,8 +3314,8 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, return 1; } - visited = 1 << state.uaccess; - if (insn->visited) { + visited = VISITED_BRANCH << state.uaccess; + if (insn->visited & VISITED_BRANCH_MASK) { if (!insn->hint && !insn_cfi_match(insn, &state.cfi)) return 1; @@ -3520,6 +3529,145 @@ static int validate_unwind_hints(struct objtool_file *file, struct section *sec) return warnings; } +/* + * Validate rethunk entry constraint: must untrain RET before the first RET. + * + * Follow every branch (intra-function) and ensure ANNOTATE_UNRET_END comes + * before an actual RET instruction. + */ +static int validate_entry(struct objtool_file *file, struct instruction *insn) +{ + struct instruction *next, *dest; + int ret, warnings = 0; + + for (;;) { + next = next_insn_to_validate(file, insn); + + if (insn->visited & VISITED_ENTRY) + return 0; + + insn->visited |= VISITED_ENTRY; + + if (!insn->ignore_alts && !list_empty(&insn->alts)) { + struct alternative *alt; + bool skip_orig = false; + + list_for_each_entry(alt, &insn->alts, list) { + if (alt->skip_orig) + skip_orig = true; + + ret = validate_entry(file, alt->insn); + if (ret) { + if (opts.backtrace) + BT_FUNC("(alt)", insn); + return ret; + } + } + + if (skip_orig) + return 0; + } + + switch (insn->type) { + + case INSN_CALL_DYNAMIC: + case INSN_JUMP_DYNAMIC: + case INSN_JUMP_DYNAMIC_CONDITIONAL: + WARN_FUNC("early indirect call", insn->sec, insn->offset); + return 1; + + case INSN_JUMP_UNCONDITIONAL: + case INSN_JUMP_CONDITIONAL: + if (!is_sibling_call(insn)) { + if (!insn->jump_dest) { + WARN_FUNC("unresolved jump target after linking?!?", + insn->sec, insn->offset); + return -1; + } + ret = validate_entry(file, insn->jump_dest); + if (ret) { + if (opts.backtrace) { + BT_FUNC("(branch%s)", insn, + insn->type == INSN_JUMP_CONDITIONAL ? "-cond" : ""); + } + return ret; + } + + if (insn->type == INSN_JUMP_UNCONDITIONAL) + return 0; + + break; + } + + /* fallthrough */ + case INSN_CALL: + dest = find_insn(file, insn->call_dest->sec, + insn->call_dest->offset); + if (!dest) { + WARN("Unresolved function after linking!?: %s", + insn->call_dest->name); + return -1; + } + + ret = validate_entry(file, dest); + if (ret) { + if (opts.backtrace) + BT_FUNC("(call)", insn); + return ret; + } + /* + * If a call returns without error, it must have seen UNTRAIN_RET. + * Therefore any non-error return is a success. + */ + return 0; + + case INSN_RETURN: + WARN_FUNC("RET before UNTRAIN", insn->sec, insn->offset); + return 1; + + case INSN_NOP: + if (insn->retpoline_safe) + return 0; + break; + + default: + break; + } + + if (!next) { + WARN_FUNC("teh end!", insn->sec, insn->offset); + return -1; + } + insn = next; + } + + return warnings; +} + +/* + * Validate that all branches starting at 'insn->entry' encounter UNRET_END + * before RET. + */ +static int validate_unret(struct objtool_file *file) +{ + struct instruction *insn; + int ret, warnings = 0; + + for_each_insn(file, insn) { + if (!insn->entry) + continue; + + ret = validate_entry(file, insn); + if (ret < 0) { + WARN_FUNC("Failed UNRET validation", insn->sec, insn->offset); + return ret; + } + warnings += ret; + } + + return warnings; +} + static int validate_retpoline(struct objtool_file *file) { struct instruction *insn; @@ -4039,6 +4187,17 @@ int check(struct objtool_file *file) warnings += ret; } + if (opts.unret) { + /* + * Must be after validate_branch() and friends, it plays + * further games with insn->visited. + */ + ret = validate_unret(file); + if (ret < 0) + return ret; + warnings += ret; + } + if (opts.ibt) { ret = validate_ibt(file); if (ret < 0) diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index 280ea18b7f2b..0c476b0b40a3 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -19,6 +19,7 @@ struct opts { bool noinstr; bool orc; bool retpoline; + bool unret; bool sls; bool stackval; bool static_call; diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index f10d7374f388..0eeedeacbefb 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -51,8 +51,10 @@ struct instruction { ignore_alts : 1, hint : 1, retpoline_safe : 1, - noendbr : 1; - /* 2 bit hole */ + noendbr : 1, + entry : 1; + /* 1 bit hole */ + s8 instr; u8 visited; /* u8 hole */ @@ -69,6 +71,11 @@ struct instruction { struct cfi_state *cfi; }; +#define VISITED_BRANCH 0x01 +#define VISITED_BRANCH_UACCESS 0x02 +#define VISITED_BRANCH_MASK 0x03 +#define VISITED_ENTRY 0x04 + static inline bool is_static_jump(struct instruction *insn) { return insn->type == INSN_JUMP_CONDITIONAL || -- cgit From 8faea26e611189e933ea2281975ff4dc7c1106b6 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 24 Jun 2022 12:52:40 +0200 Subject: objtool: Re-add UNWIND_HINT_{SAVE_RESTORE} Commit c536ed2fffd5 ("objtool: Remove SAVE/RESTORE hints") removed the save/restore unwind hints because they were no longer needed. Now they're going to be needed again so re-add them. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov --- arch/x86/include/asm/unwind_hints.h | 12 +++++++++-- include/linux/objtool.h | 6 ++++-- tools/include/linux/objtool.h | 6 ++++-- tools/objtool/check.c | 40 +++++++++++++++++++++++++++++++++++ tools/objtool/include/objtool/check.h | 19 +++++++++-------- 5 files changed, 68 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index 6f70fe4c93f2..f66fbe6537dd 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -8,11 +8,11 @@ #ifdef __ASSEMBLY__ .macro UNWIND_HINT_EMPTY - UNWIND_HINT sp_reg=ORC_REG_UNDEFINED type=UNWIND_HINT_TYPE_CALL end=1 + UNWIND_HINT type=UNWIND_HINT_TYPE_CALL end=1 .endm .macro UNWIND_HINT_ENTRY - UNWIND_HINT sp_reg=ORC_REG_UNDEFINED type=UNWIND_HINT_TYPE_ENTRY end=1 + UNWIND_HINT type=UNWIND_HINT_TYPE_ENTRY end=1 .endm .macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 partial=0 @@ -56,6 +56,14 @@ UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=8 type=UNWIND_HINT_TYPE_FUNC .endm +.macro UNWIND_HINT_SAVE + UNWIND_HINT type=UNWIND_HINT_TYPE_SAVE +.endm + +.macro UNWIND_HINT_RESTORE + UNWIND_HINT type=UNWIND_HINT_TYPE_RESTORE +.endm + #else #define UNWIND_HINT_FUNC \ diff --git a/include/linux/objtool.h b/include/linux/objtool.h index b026f1ae39c6..10bc88cc3bf6 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -40,6 +40,8 @@ struct unwind_hint { #define UNWIND_HINT_TYPE_REGS_PARTIAL 2 #define UNWIND_HINT_TYPE_FUNC 3 #define UNWIND_HINT_TYPE_ENTRY 4 +#define UNWIND_HINT_TYPE_SAVE 5 +#define UNWIND_HINT_TYPE_RESTORE 6 #ifdef CONFIG_OBJTOOL @@ -127,7 +129,7 @@ struct unwind_hint { * the debuginfo as necessary. It will also warn if it sees any * inconsistencies. */ -.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0 +.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0 .Lunwind_hint_ip_\@: .pushsection .discard.unwind_hints /* struct unwind_hint */ @@ -180,7 +182,7 @@ struct unwind_hint { #define ASM_REACHABLE #else #define ANNOTATE_INTRA_FUNCTION_CALL -.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0 +.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0 .endm .macro STACK_FRAME_NON_STANDARD func:req .endm diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h index b026f1ae39c6..10bc88cc3bf6 100644 --- a/tools/include/linux/objtool.h +++ b/tools/include/linux/objtool.h @@ -40,6 +40,8 @@ struct unwind_hint { #define UNWIND_HINT_TYPE_REGS_PARTIAL 2 #define UNWIND_HINT_TYPE_FUNC 3 #define UNWIND_HINT_TYPE_ENTRY 4 +#define UNWIND_HINT_TYPE_SAVE 5 +#define UNWIND_HINT_TYPE_RESTORE 6 #ifdef CONFIG_OBJTOOL @@ -127,7 +129,7 @@ struct unwind_hint { * the debuginfo as necessary. It will also warn if it sees any * inconsistencies. */ -.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0 +.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0 .Lunwind_hint_ip_\@: .pushsection .discard.unwind_hints /* struct unwind_hint */ @@ -180,7 +182,7 @@ struct unwind_hint { #define ASM_REACHABLE #else #define ANNOTATE_INTRA_FUNCTION_CALL -.macro UNWIND_HINT sp_reg:req sp_offset=0 type:req end=0 +.macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 end=0 .endm .macro STACK_FRAME_NON_STANDARD func:req .endm diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 822a490e6d87..ddfdd138cc2a 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2032,6 +2032,17 @@ static int read_unwind_hints(struct objtool_file *file) insn->hint = true; + if (hint->type == UNWIND_HINT_TYPE_SAVE) { + insn->hint = false; + insn->save = true; + continue; + } + + if (hint->type == UNWIND_HINT_TYPE_RESTORE) { + insn->restore = true; + continue; + } + if (hint->type == UNWIND_HINT_TYPE_REGS_PARTIAL) { struct symbol *sym = find_symbol_by_offset(insn->sec, insn->offset); @@ -3329,6 +3340,35 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, state.instr += insn->instr; if (insn->hint) { + if (insn->restore) { + struct instruction *save_insn, *i; + + i = insn; + save_insn = NULL; + + sym_for_each_insn_continue_reverse(file, func, i) { + if (i->save) { + save_insn = i; + break; + } + } + + if (!save_insn) { + WARN_FUNC("no corresponding CFI save for CFI restore", + sec, insn->offset); + return 1; + } + + if (!save_insn->visited) { + WARN_FUNC("objtool isn't smart enough to handle this CFI save/restore combo", + sec, insn->offset); + return 1; + } + + insn->cfi = save_insn->cfi; + nr_cfi_reused++; + } + state.cfi = *insn->cfi; } else { /* XXX track if we actually changed state.cfi */ diff --git a/tools/objtool/include/objtool/check.h b/tools/objtool/include/objtool/check.h index 0eeedeacbefb..036129cebeee 100644 --- a/tools/objtool/include/objtool/check.h +++ b/tools/objtool/include/objtool/check.h @@ -46,18 +46,19 @@ struct instruction { enum insn_type type; unsigned long immediate; - u8 dead_end : 1, - ignore : 1, - ignore_alts : 1, - hint : 1, - retpoline_safe : 1, - noendbr : 1, - entry : 1; - /* 1 bit hole */ + u16 dead_end : 1, + ignore : 1, + ignore_alts : 1, + hint : 1, + save : 1, + restore : 1, + retpoline_safe : 1, + noendbr : 1, + entry : 1; + /* 7 bit hole */ s8 instr; u8 visited; - /* u8 hole */ struct alt_group *alt_group; struct symbol *call_dest; -- cgit From 8698e3bab4dd7968666e84e111d0bfd17c040e77 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 27 Jun 2022 20:47:19 +0300 Subject: fanotify: refine the validation checks on non-dir inode mask Commit ceaf69f8eadc ("fanotify: do not allow setting dirent events in mask of non-dir") added restrictions about setting dirent events in the mask of a non-dir inode mark, which does not make any sense. For backward compatibility, these restictions were added only to new (v5.17+) APIs. It also does not make any sense to set the flags FAN_EVENT_ON_CHILD or FAN_ONDIR in the mask of a non-dir inode. Add these flags to the dir-only restriction of the new APIs as well. Move the check of the dir-only flags for new APIs into the helper fanotify_events_supported(), which is only called for FAN_MARK_ADD, because there is no need to error on an attempt to remove the dir-only flags from non-dir inode. Fixes: ceaf69f8eadc ("fanotify: do not allow setting dirent events in mask of non-dir") Link: https://lore.kernel.org/linux-fsdevel/20220627113224.kr2725conevh53u4@quack3.lan/ Link: https://lore.kernel.org/r/20220627174719.2838175-1-amir73il@gmail.com Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify_user.c | 34 +++++++++++++++++++--------------- include/linux/fanotify.h | 4 ++++ 2 files changed, 23 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index c2255b440df9..b08ce0d821a7 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1513,8 +1513,15 @@ static int fanotify_test_fid(struct dentry *dentry) return 0; } -static int fanotify_events_supported(struct path *path, __u64 mask) +static int fanotify_events_supported(struct fsnotify_group *group, + struct path *path, __u64 mask, + unsigned int flags) { + unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; + /* Strict validation of events in non-dir inode mask with v5.17+ APIs */ + bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) || + (mask & FAN_RENAME); + /* * Some filesystems such as 'proc' acquire unusual locks when opening * files. For them fanotify permission events have high chances of @@ -1526,6 +1533,16 @@ static int fanotify_events_supported(struct path *path, __u64 mask) if (mask & FANOTIFY_PERM_EVENTS && path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM) return -EINVAL; + + /* + * We shouldn't have allowed setting dirent events and the directory + * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode, + * but because we always allowed it, error only when using new APIs. + */ + if (strict_dir_events && mark_type == FAN_MARK_INODE && + !d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS)) + return -ENOTDIR; + return 0; } @@ -1672,7 +1689,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, goto fput_and_out; if (flags & FAN_MARK_ADD) { - ret = fanotify_events_supported(&path, mask); + ret = fanotify_events_supported(group, &path, mask, flags); if (ret) goto path_put_and_out; } @@ -1695,19 +1712,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, else mnt = path.mnt; - /* - * FAN_RENAME is not allowed on non-dir (for now). - * We shouldn't have allowed setting any dirent events in mask of - * non-dir, but because we always allowed it, error only if group - * was initialized with the new flag FAN_REPORT_TARGET_FID. - */ - ret = -ENOTDIR; - if (inode && !S_ISDIR(inode->i_mode) && - ((mask & FAN_RENAME) || - ((mask & FANOTIFY_DIRENT_EVENTS) && - FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID)))) - goto path_put_and_out; - /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ if (mnt || !S_ISDIR(inode->i_mode)) { mask &= ~FAN_EVENT_ON_CHILD; diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index edc28555814c..e517dbcf74ed 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -111,6 +111,10 @@ FANOTIFY_PERM_EVENTS | \ FAN_Q_OVERFLOW | FAN_ONDIR) +/* Events and flags relevant only for directories */ +#define FANOTIFY_DIRONLY_EVENT_BITS (FANOTIFY_DIRENT_EVENTS | \ + FAN_EVENT_ON_CHILD | FAN_ONDIR) + #define ALL_FANOTIFY_EVENT_BITS (FANOTIFY_OUTGOING_EVENTS | \ FANOTIFY_EVENT_FLAGS) -- cgit From 0fe3dbbefb74a8575f61d7801b08dbc50523d60d Mon Sep 17 00:00:00 2001 From: Tao Liu Date: Mon, 27 Jun 2022 22:00:04 +0800 Subject: linux/dim: Fix divide by 0 in RDMA DIM Fix a divide 0 error in rdma_dim_stats_compare() when prev->cpe_ratio == 0. CallTrace: Hardware name: H3C R4900 G3/RS33M2C9S, BIOS 2.00.37P21 03/12/2020 task: ffff880194b78000 task.stack: ffffc90006714000 RIP: 0010:backport_rdma_dim+0x10e/0x240 [mlx_compat] RSP: 0018:ffff880c10e83ec0 EFLAGS: 00010202 RAX: 0000000000002710 RBX: ffff88096cd7f780 RCX: 0000000000000064 RDX: 0000000000000000 RSI: 0000000000000002 RDI: 0000000000000001 RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 000000001d7c6c09 R13: ffff88096cd7f780 R14: ffff880b174fe800 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff880c10e80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000a0965b00 CR3: 000000000200a003 CR4: 00000000007606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ib_poll_handler+0x43/0x80 [ib_core] irq_poll_softirq+0xae/0x110 __do_softirq+0xd1/0x28c irq_exit+0xde/0xf0 do_IRQ+0x54/0xe0 common_interrupt+0x8f/0x8f ? cpuidle_enter_state+0xd9/0x2a0 ? cpuidle_enter_state+0xc7/0x2a0 ? do_idle+0x170/0x1d0 ? cpu_startup_entry+0x6f/0x80 ? start_secondary+0x1b9/0x210 ? secondary_startup_64+0xa5/0xb0 Code: 0f 87 e1 00 00 00 8b 4c 24 14 44 8b 43 14 89 c8 4d 63 c8 44 29 c0 99 31 d0 29 d0 31 d2 48 98 48 8d 04 80 48 8d 04 80 48 c1 e0 02 <49> f7 f1 48 83 f8 0a 0f 86 c1 00 00 00 44 39 c1 7f 10 48 89 df RIP: backport_rdma_dim+0x10e/0x240 [mlx_compat] RSP: ffff880c10e83ec0 Fixes: f4915455dcf0 ("linux/dim: Implement RDMA adaptive moderation (DIM)") Link: https://lore.kernel.org/r/20220627140004.3099-1-thomas.liu@ucloud.cn Signed-off-by: Tao Liu Reviewed-by: Max Gurtovoy Acked-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/dim.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dim.h b/include/linux/dim.h index b698266d0035..6c5733981563 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -21,7 +21,7 @@ * We consider 10% difference as significant. */ #define IS_SIGNIFICANT_DIFF(val, ref) \ - (((100UL * abs((val) - (ref))) / (ref)) > 10) + ((ref) && (((100UL * abs((val) - (ref))) / (ref)) > 10)) /* * Calculate the gap between two values. -- cgit From 9e121040e54abef9ed5542e5fdfa87911cd96204 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Tue, 7 Jun 2022 20:23:34 +0200 Subject: firmware: sysfb: Make sysfb_create_simplefb() return a pdev pointer This function just returned 0 on success or an errno code on error, but it could be useful for sysfb_init() callers to have a pointer to the device. Signed-off-by: Javier Martinez Canillas Reviewed-by: Daniel Vetter Reviewed-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/20220607182338.344270-2-javierm@redhat.com --- drivers/firmware/sysfb.c | 4 ++-- drivers/firmware/sysfb_simplefb.c | 16 ++++++++-------- include/linux/sysfb.h | 10 +++++----- 3 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/sysfb.c b/drivers/firmware/sysfb.c index 2bfbb05f7d89..b032f40a92de 100644 --- a/drivers/firmware/sysfb.c +++ b/drivers/firmware/sysfb.c @@ -46,8 +46,8 @@ static __init int sysfb_init(void) /* try to create a simple-framebuffer device */ compatible = sysfb_parse_mode(si, &mode); if (compatible) { - ret = sysfb_create_simplefb(si, &mode); - if (!ret) + pd = sysfb_create_simplefb(si, &mode); + if (!IS_ERR(pd)) return 0; } diff --git a/drivers/firmware/sysfb_simplefb.c b/drivers/firmware/sysfb_simplefb.c index bda8712bfd8c..a353e27f83f5 100644 --- a/drivers/firmware/sysfb_simplefb.c +++ b/drivers/firmware/sysfb_simplefb.c @@ -57,8 +57,8 @@ __init bool sysfb_parse_mode(const struct screen_info *si, return false; } -__init int sysfb_create_simplefb(const struct screen_info *si, - const struct simplefb_platform_data *mode) +__init struct platform_device *sysfb_create_simplefb(const struct screen_info *si, + const struct simplefb_platform_data *mode) { struct platform_device *pd; struct resource res; @@ -76,7 +76,7 @@ __init int sysfb_create_simplefb(const struct screen_info *si, base |= (u64)si->ext_lfb_base << 32; if (!base || (u64)(resource_size_t)base != base) { printk(KERN_DEBUG "sysfb: inaccessible VRAM base\n"); - return -EINVAL; + return ERR_PTR(-EINVAL); } /* @@ -93,7 +93,7 @@ __init int sysfb_create_simplefb(const struct screen_info *si, length = mode->height * mode->stride; if (length > size) { printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n"); - return -EINVAL; + return ERR_PTR(-EINVAL); } length = PAGE_ALIGN(length); @@ -104,11 +104,11 @@ __init int sysfb_create_simplefb(const struct screen_info *si, res.start = base; res.end = res.start + length - 1; if (res.end <= res.start) - return -EINVAL; + return ERR_PTR(-EINVAL); pd = platform_device_alloc("simple-framebuffer", 0); if (!pd) - return -ENOMEM; + return ERR_PTR(-ENOMEM); sysfb_apply_efi_quirks(pd); @@ -124,10 +124,10 @@ __init int sysfb_create_simplefb(const struct screen_info *si, if (ret) goto err_put_device; - return 0; + return pd; err_put_device: platform_device_put(pd); - return ret; + return ERR_PTR(ret); } diff --git a/include/linux/sysfb.h b/include/linux/sysfb.h index b0dcfa26d07b..708152e9037b 100644 --- a/include/linux/sysfb.h +++ b/include/linux/sysfb.h @@ -72,8 +72,8 @@ static inline void sysfb_apply_efi_quirks(struct platform_device *pd) bool sysfb_parse_mode(const struct screen_info *si, struct simplefb_platform_data *mode); -int sysfb_create_simplefb(const struct screen_info *si, - const struct simplefb_platform_data *mode); +struct platform_device *sysfb_create_simplefb(const struct screen_info *si, + const struct simplefb_platform_data *mode); #else /* CONFIG_SYSFB_SIMPLE */ @@ -83,10 +83,10 @@ static inline bool sysfb_parse_mode(const struct screen_info *si, return false; } -static inline int sysfb_create_simplefb(const struct screen_info *si, - const struct simplefb_platform_data *mode) +static inline struct platform_device *sysfb_create_simplefb(const struct screen_info *si, + const struct simplefb_platform_data *mode) { - return -EINVAL; + return ERR_PTR(-EINVAL); } #endif /* CONFIG_SYSFB_SIMPLE */ -- cgit From bde376e9de3c0bc55eedc8956b0f114c05531595 Mon Sep 17 00:00:00 2001 From: Javier Martinez Canillas Date: Tue, 7 Jun 2022 20:23:35 +0200 Subject: firmware: sysfb: Add sysfb_disable() helper function This can be used by subsystems to unregister a platform device registered by sysfb and also to disable future platform device registration in sysfb. Suggested-by: Daniel Vetter Signed-off-by: Javier Martinez Canillas Reviewed-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20220607182338.344270-3-javierm@redhat.com --- .../driver-api/firmware/other_interfaces.rst | 6 +++ drivers/firmware/sysfb.c | 54 +++++++++++++++++++--- include/linux/sysfb.h | 12 +++++ 3 files changed, 66 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/Documentation/driver-api/firmware/other_interfaces.rst b/Documentation/driver-api/firmware/other_interfaces.rst index b81794e0cfbb..06ac89adaafb 100644 --- a/Documentation/driver-api/firmware/other_interfaces.rst +++ b/Documentation/driver-api/firmware/other_interfaces.rst @@ -13,6 +13,12 @@ EDD Interfaces .. kernel-doc:: drivers/firmware/edd.c :internal: +Generic System Framebuffers Interface +------------------------------------- + +.. kernel-doc:: drivers/firmware/sysfb.c + :export: + Intel Stratix10 SoC Service Layer --------------------------------- Some features of the Intel Stratix10 SoC require a level of privilege diff --git a/drivers/firmware/sysfb.c b/drivers/firmware/sysfb.c index b032f40a92de..1f276f108cc9 100644 --- a/drivers/firmware/sysfb.c +++ b/drivers/firmware/sysfb.c @@ -34,21 +34,59 @@ #include #include +static struct platform_device *pd; +static DEFINE_MUTEX(disable_lock); +static bool disabled; + +static bool sysfb_unregister(void) +{ + if (IS_ERR_OR_NULL(pd)) + return false; + + platform_device_unregister(pd); + pd = NULL; + + return true; +} + +/** + * sysfb_disable() - disable the Generic System Framebuffers support + * + * This disables the registration of system framebuffer devices that match the + * generic drivers that make use of the system framebuffer set up by firmware. + * + * It also unregisters a device if this was already registered by sysfb_init(). + * + * Context: The function can sleep. A @disable_lock mutex is acquired to serialize + * against sysfb_init(), that registers a system framebuffer device. + */ +void sysfb_disable(void) +{ + mutex_lock(&disable_lock); + sysfb_unregister(); + disabled = true; + mutex_unlock(&disable_lock); +} +EXPORT_SYMBOL_GPL(sysfb_disable); + static __init int sysfb_init(void) { struct screen_info *si = &screen_info; struct simplefb_platform_data mode; - struct platform_device *pd; const char *name; bool compatible; - int ret; + int ret = 0; + + mutex_lock(&disable_lock); + if (disabled) + goto unlock_mutex; /* try to create a simple-framebuffer device */ compatible = sysfb_parse_mode(si, &mode); if (compatible) { pd = sysfb_create_simplefb(si, &mode); if (!IS_ERR(pd)) - return 0; + goto unlock_mutex; } /* if the FB is incompatible, create a legacy framebuffer device */ @@ -60,8 +98,10 @@ static __init int sysfb_init(void) name = "platform-framebuffer"; pd = platform_device_alloc(name, 0); - if (!pd) - return -ENOMEM; + if (!pd) { + ret = -ENOMEM; + goto unlock_mutex; + } sysfb_apply_efi_quirks(pd); @@ -73,9 +113,11 @@ static __init int sysfb_init(void) if (ret) goto err; - return 0; + goto unlock_mutex; err: platform_device_put(pd); +unlock_mutex: + mutex_unlock(&disable_lock); return ret; } diff --git a/include/linux/sysfb.h b/include/linux/sysfb.h index 708152e9037b..8ba8b5be5567 100644 --- a/include/linux/sysfb.h +++ b/include/linux/sysfb.h @@ -55,6 +55,18 @@ struct efifb_dmi_info { int flags; }; +#ifdef CONFIG_SYSFB + +void sysfb_disable(void); + +#else /* CONFIG_SYSFB */ + +static inline void sysfb_disable(void) +{ +} + +#endif /* CONFIG_SYSFB */ + #ifdef CONFIG_EFI extern struct efifb_dmi_info efifb_dmi_list[]; -- cgit From a57f68ddc8865d59a19783080cc52fb4a11dc209 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 24 Jun 2022 17:18:45 +0300 Subject: reset: Fix devm bulk optional exclusive control getter Most likely due to copy-paste mistake the device managed version of the denoted reset control getter has been implemented with invalid semantic, which can be immediately spotted by having "WARN_ON(shared && acquired)" warning in the system log as soon as the method is called. Anyway let's fix it by altering the boolean arguments passed to the __devm_reset_control_bulk_get() method from - shared = true, optional = false, acquired = true to + shared = false, optional = true, acquired = true That's what they were supposed to be in the first place (see the non-devm version of the same method: reset_control_bulk_get_optional_exclusive()). Fixes: 48d71395896d ("reset: Add reset_control_bulk API") Signed-off-by: Serge Semin Reviewed-by: Dmitry Osipenko Signed-off-by: Philipp Zabel Link: https://lore.kernel.org/r/20220624141853.7417-2-Sergey.Semin@baikalelectronics.ru --- include/linux/reset.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/reset.h b/include/linux/reset.h index 8a21b5756c3e..514ddf003efc 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -731,7 +731,7 @@ static inline int __must_check devm_reset_control_bulk_get_optional_exclusive(struct device *dev, int num_rstcs, struct reset_control_bulk_data *rstcs) { - return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, true, false, true); + return __devm_reset_control_bulk_get(dev, num_rstcs, rstcs, false, true, true); } /** -- cgit From b5d281f6c16dd432b618bdfd36ddba1a58d5b603 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Mon, 20 Jun 2022 00:03:51 +0200 Subject: PM / devfreq: Rework freq_table to be local to devfreq struct On a devfreq PROBE_DEFER, the freq_table in the driver profile struct, is never reset and may be leaved in an undefined state. This comes from the fact that we store the freq_table in the driver profile struct that is commonly defined as static and not reset on PROBE_DEFER. We currently skip the reinit of the freq_table if we found it's already defined since a driver may declare his own freq_table. This logic is flawed in the case devfreq core generate a freq_table, set it in the profile struct and then PROBE_DEFER, freeing the freq_table. In this case devfreq will found a NOT NULL freq_table that has been freed, skip the freq_table generation and probe the driver based on the wrong table. To fix this and correctly handle PROBE_DEFER, use a local freq_table and max_state in the devfreq struct and never modify the freq_table present in the profile struct if it does provide it. Fixes: 0ec09ac2cebe ("PM / devfreq: Set the freq_table of devfreq device") Cc: stable@vger.kernel.org Signed-off-by: Christian Marangi Signed-off-by: Chanwoo Choi --- drivers/devfreq/devfreq.c | 71 ++++++++++++++++++-------------------- drivers/devfreq/governor_passive.c | 14 ++++---- include/linux/devfreq.h | 5 +++ 3 files changed, 46 insertions(+), 44 deletions(-) (limited to 'include/linux') diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 80a1235ef8fb..9602141bb8ec 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -123,7 +123,7 @@ void devfreq_get_freq_range(struct devfreq *devfreq, unsigned long *min_freq, unsigned long *max_freq) { - unsigned long *freq_table = devfreq->profile->freq_table; + unsigned long *freq_table = devfreq->freq_table; s32 qos_min_freq, qos_max_freq; lockdep_assert_held(&devfreq->lock); @@ -133,11 +133,11 @@ void devfreq_get_freq_range(struct devfreq *devfreq, * The devfreq drivers can initialize this in either ascending or * descending order and devfreq core supports both. */ - if (freq_table[0] < freq_table[devfreq->profile->max_state - 1]) { + if (freq_table[0] < freq_table[devfreq->max_state - 1]) { *min_freq = freq_table[0]; - *max_freq = freq_table[devfreq->profile->max_state - 1]; + *max_freq = freq_table[devfreq->max_state - 1]; } else { - *min_freq = freq_table[devfreq->profile->max_state - 1]; + *min_freq = freq_table[devfreq->max_state - 1]; *max_freq = freq_table[0]; } @@ -169,8 +169,8 @@ static int devfreq_get_freq_level(struct devfreq *devfreq, unsigned long freq) { int lev; - for (lev = 0; lev < devfreq->profile->max_state; lev++) - if (freq == devfreq->profile->freq_table[lev]) + for (lev = 0; lev < devfreq->max_state; lev++) + if (freq == devfreq->freq_table[lev]) return lev; return -EINVAL; @@ -178,7 +178,6 @@ static int devfreq_get_freq_level(struct devfreq *devfreq, unsigned long freq) static int set_freq_table(struct devfreq *devfreq) { - struct devfreq_dev_profile *profile = devfreq->profile; struct dev_pm_opp *opp; unsigned long freq; int i, count; @@ -188,25 +187,22 @@ static int set_freq_table(struct devfreq *devfreq) if (count <= 0) return -EINVAL; - profile->max_state = count; - profile->freq_table = devm_kcalloc(devfreq->dev.parent, - profile->max_state, - sizeof(*profile->freq_table), - GFP_KERNEL); - if (!profile->freq_table) { - profile->max_state = 0; + devfreq->max_state = count; + devfreq->freq_table = devm_kcalloc(devfreq->dev.parent, + devfreq->max_state, + sizeof(*devfreq->freq_table), + GFP_KERNEL); + if (!devfreq->freq_table) return -ENOMEM; - } - for (i = 0, freq = 0; i < profile->max_state; i++, freq++) { + for (i = 0, freq = 0; i < devfreq->max_state; i++, freq++) { opp = dev_pm_opp_find_freq_ceil(devfreq->dev.parent, &freq); if (IS_ERR(opp)) { - devm_kfree(devfreq->dev.parent, profile->freq_table); - profile->max_state = 0; + devm_kfree(devfreq->dev.parent, devfreq->freq_table); return PTR_ERR(opp); } dev_pm_opp_put(opp); - profile->freq_table[i] = freq; + devfreq->freq_table[i] = freq; } return 0; @@ -246,7 +242,7 @@ int devfreq_update_status(struct devfreq *devfreq, unsigned long freq) if (lev != prev_lev) { devfreq->stats.trans_table[ - (prev_lev * devfreq->profile->max_state) + lev]++; + (prev_lev * devfreq->max_state) + lev]++; devfreq->stats.total_trans++; } @@ -835,6 +831,9 @@ struct devfreq *devfreq_add_device(struct device *dev, if (err < 0) goto err_dev; mutex_lock(&devfreq->lock); + } else { + devfreq->freq_table = devfreq->profile->freq_table; + devfreq->max_state = devfreq->profile->max_state; } devfreq->scaling_min_freq = find_available_min_freq(devfreq); @@ -870,8 +869,8 @@ struct devfreq *devfreq_add_device(struct device *dev, devfreq->stats.trans_table = devm_kzalloc(&devfreq->dev, array3_size(sizeof(unsigned int), - devfreq->profile->max_state, - devfreq->profile->max_state), + devfreq->max_state, + devfreq->max_state), GFP_KERNEL); if (!devfreq->stats.trans_table) { mutex_unlock(&devfreq->lock); @@ -880,7 +879,7 @@ struct devfreq *devfreq_add_device(struct device *dev, } devfreq->stats.time_in_state = devm_kcalloc(&devfreq->dev, - devfreq->profile->max_state, + devfreq->max_state, sizeof(*devfreq->stats.time_in_state), GFP_KERNEL); if (!devfreq->stats.time_in_state) { @@ -1666,9 +1665,9 @@ static ssize_t available_frequencies_show(struct device *d, mutex_lock(&df->lock); - for (i = 0; i < df->profile->max_state; i++) + for (i = 0; i < df->max_state; i++) count += scnprintf(&buf[count], (PAGE_SIZE - count - 2), - "%lu ", df->profile->freq_table[i]); + "%lu ", df->freq_table[i]); mutex_unlock(&df->lock); /* Truncate the trailing space */ @@ -1691,7 +1690,7 @@ static ssize_t trans_stat_show(struct device *dev, if (!df->profile) return -EINVAL; - max_state = df->profile->max_state; + max_state = df->max_state; if (max_state == 0) return sprintf(buf, "Not Supported.\n"); @@ -1708,19 +1707,17 @@ static ssize_t trans_stat_show(struct device *dev, len += sprintf(buf + len, " :"); for (i = 0; i < max_state; i++) len += sprintf(buf + len, "%10lu", - df->profile->freq_table[i]); + df->freq_table[i]); len += sprintf(buf + len, " time(ms)\n"); for (i = 0; i < max_state; i++) { - if (df->profile->freq_table[i] - == df->previous_freq) { + if (df->freq_table[i] == df->previous_freq) len += sprintf(buf + len, "*"); - } else { + else len += sprintf(buf + len, " "); - } - len += sprintf(buf + len, "%10lu:", - df->profile->freq_table[i]); + + len += sprintf(buf + len, "%10lu:", df->freq_table[i]); for (j = 0; j < max_state; j++) len += sprintf(buf + len, "%10u", df->stats.trans_table[(i * max_state) + j]); @@ -1744,7 +1741,7 @@ static ssize_t trans_stat_store(struct device *dev, if (!df->profile) return -EINVAL; - if (df->profile->max_state == 0) + if (df->max_state == 0) return count; err = kstrtoint(buf, 10, &value); @@ -1752,11 +1749,11 @@ static ssize_t trans_stat_store(struct device *dev, return -EINVAL; mutex_lock(&df->lock); - memset(df->stats.time_in_state, 0, (df->profile->max_state * + memset(df->stats.time_in_state, 0, (df->max_state * sizeof(*df->stats.time_in_state))); memset(df->stats.trans_table, 0, array3_size(sizeof(unsigned int), - df->profile->max_state, - df->profile->max_state)); + df->max_state, + df->max_state)); df->stats.total_trans = 0; df->stats.last_update = get_jiffies_64(); mutex_unlock(&df->lock); diff --git a/drivers/devfreq/governor_passive.c b/drivers/devfreq/governor_passive.c index 69e06725d92b..406ef79c0c46 100644 --- a/drivers/devfreq/governor_passive.c +++ b/drivers/devfreq/governor_passive.c @@ -144,18 +144,18 @@ static int get_target_freq_with_devfreq(struct devfreq *devfreq, goto out; /* Use interpolation if required opps is not available */ - for (i = 0; i < parent_devfreq->profile->max_state; i++) - if (parent_devfreq->profile->freq_table[i] == *freq) + for (i = 0; i < parent_devfreq->max_state; i++) + if (parent_devfreq->freq_table[i] == *freq) break; - if (i == parent_devfreq->profile->max_state) + if (i == parent_devfreq->max_state) return -EINVAL; - if (i < devfreq->profile->max_state) { - child_freq = devfreq->profile->freq_table[i]; + if (i < devfreq->max_state) { + child_freq = devfreq->freq_table[i]; } else { - count = devfreq->profile->max_state; - child_freq = devfreq->profile->freq_table[count - 1]; + count = devfreq->max_state; + child_freq = devfreq->freq_table[count - 1]; } out: diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h index dc10bee75a72..34aab4dd336c 100644 --- a/include/linux/devfreq.h +++ b/include/linux/devfreq.h @@ -148,6 +148,8 @@ struct devfreq_stats { * reevaluate operable frequencies. Devfreq users may use * devfreq.nb to the corresponding register notifier call chain. * @work: delayed work for load monitoring. + * @freq_table: current frequency table used by the devfreq driver. + * @max_state: count of entry present in the frequency table. * @previous_freq: previously configured frequency value. * @last_status: devfreq user device info, performance statistics * @data: Private data of the governor. The devfreq framework does not @@ -185,6 +187,9 @@ struct devfreq { struct notifier_block nb; struct delayed_work work; + unsigned long *freq_table; + unsigned int max_state; + unsigned long previous_freq; struct devfreq_dev_status last_status; -- cgit From 1758bde2e4aa5ff188d53e7d9d388bbb7e12eebb Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 28 Jun 2022 12:15:08 +0200 Subject: net: phy: Don't trigger state machine while in suspend Upon system sleep, mdio_bus_phy_suspend() stops the phy_state_machine(), but subsequent interrupts may retrigger it: They may have been left enabled to facilitate wakeup and are not quiesced until the ->suspend_noirq() phase. Unwanted interrupts may hence occur between mdio_bus_phy_suspend() and dpm_suspend_noirq(), as well as between dpm_resume_noirq() and mdio_bus_phy_resume(). Retriggering the phy_state_machine() through an interrupt is not only undesirable for the reason given in mdio_bus_phy_suspend() (freezing it midway with phydev->lock held), but also because the PHY may be inaccessible after it's suspended: Accesses to USB-attached PHYs are blocked once usb_suspend_both() clears the can_submit flag and PHYs on PCI network cards may become inaccessible upon suspend as well. Amend phy_interrupt() to avoid triggering the state machine if the PHY is suspended. Signal wakeup instead if the attached net_device or its parent has been configured as a wakeup source. (Those conditions are identical to mdio_bus_phy_may_suspend().) Postpone handling of the interrupt until the PHY has resumed. Before stopping the phy_state_machine() in mdio_bus_phy_suspend(), wait for a concurrent phy_interrupt() to run to completion. That is necessary because phy_interrupt() may have checked the PHY's suspend status before the system sleep transition commenced and it may thus retrigger the state machine after it was stopped. Likewise, after re-enabling interrupt handling in mdio_bus_phy_resume(), wait for a concurrent phy_interrupt() to complete to ensure that interrupts which it postponed are properly rerun. The issue was exposed by commit 1ce8b37241ed ("usbnet: smsc95xx: Forward PHY interrupts to PHY driver to avoid polling"), but has existed since forever. Fixes: 541cd3ee00a4 ("phylib: Fix deadlock on resume") Link: https://lore.kernel.org/netdev/a5315a8a-32c2-962f-f696-de9a26d30091@samsung.com/ Reported-by: Marek Szyprowski Tested-by: Marek Szyprowski Signed-off-by: Lukas Wunner Acked-by: Rafael J. Wysocki Cc: stable@vger.kernel.org # v2.6.33+ Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/b7f386d04e9b5b0e2738f0125743e30676f309ef.1656410895.git.lukas@wunner.de Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy.c | 23 +++++++++++++++++++++++ drivers/net/phy/phy_device.c | 23 +++++++++++++++++++++++ include/linux/phy.h | 6 ++++++ 3 files changed, 52 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index ef62f357b76d..8d3ee3a6495b 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -976,6 +977,28 @@ static irqreturn_t phy_interrupt(int irq, void *phy_dat) struct phy_driver *drv = phydev->drv; irqreturn_t ret; + /* Wakeup interrupts may occur during a system sleep transition. + * Postpone handling until the PHY has resumed. + */ + if (IS_ENABLED(CONFIG_PM_SLEEP) && phydev->irq_suspended) { + struct net_device *netdev = phydev->attached_dev; + + if (netdev) { + struct device *parent = netdev->dev.parent; + + if (netdev->wol_enabled) + pm_system_wakeup(); + else if (device_may_wakeup(&netdev->dev)) + pm_wakeup_dev_event(&netdev->dev, 0, true); + else if (parent && device_may_wakeup(parent)) + pm_wakeup_dev_event(parent, 0, true); + } + + phydev->irq_rerun = 1; + disable_irq_nosync(irq); + return IRQ_HANDLED; + } + mutex_lock(&phydev->lock); ret = drv->handle_interrupt(phydev); mutex_unlock(&phydev->lock); diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 431a8719c635..46acddd865a7 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -278,6 +278,15 @@ static __maybe_unused int mdio_bus_phy_suspend(struct device *dev) if (phydev->mac_managed_pm) return 0; + /* Wakeup interrupts may occur during the system sleep transition when + * the PHY is inaccessible. Set flag to postpone handling until the PHY + * has resumed. Wait for concurrent interrupt handler to complete. + */ + if (phy_interrupt_is_valid(phydev)) { + phydev->irq_suspended = 1; + synchronize_irq(phydev->irq); + } + /* We must stop the state machine manually, otherwise it stops out of * control, possibly with the phydev->lock held. Upon resume, netdev * may call phy routines that try to grab the same lock, and that may @@ -315,6 +324,20 @@ static __maybe_unused int mdio_bus_phy_resume(struct device *dev) if (ret < 0) return ret; no_resume: + if (phy_interrupt_is_valid(phydev)) { + phydev->irq_suspended = 0; + synchronize_irq(phydev->irq); + + /* Rerun interrupts which were postponed by phy_interrupt() + * because they occurred during the system sleep transition. + */ + if (phydev->irq_rerun) { + phydev->irq_rerun = 0; + enable_irq(phydev->irq); + irq_wake_thread(phydev->irq, phydev); + } + } + if (phydev->attached_dev && phydev->adjust_link) phy_start_machine(phydev); diff --git a/include/linux/phy.h b/include/linux/phy.h index 508f1149665b..b09f7d36cff2 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -572,6 +572,10 @@ struct macsec_ops; * @mdix_ctrl: User setting of crossover * @pma_extable: Cached value of PMA/PMD Extended Abilities Register * @interrupts: Flag interrupts have been enabled + * @irq_suspended: Flag indicating PHY is suspended and therefore interrupt + * handling shall be postponed until PHY has resumed + * @irq_rerun: Flag indicating interrupts occurred while PHY was suspended, + * requiring a rerun of the interrupt handler after resume * @interface: enum phy_interface_t value * @skb: Netlink message for cable diagnostics * @nest: Netlink nest used for cable diagnostics @@ -626,6 +630,8 @@ struct phy_device { /* Interrupts are enabled */ unsigned interrupts:1; + unsigned irq_suspended:1; + unsigned irq_rerun:1; enum phy_state state; -- cgit From f9b11229b79c0fb2100b5bb4628a101b1d37fbf6 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Wed, 29 Jun 2022 12:48:41 +0300 Subject: serial: 8250: Fix PM usage_count for console handover MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When console is enabled, univ8250_console_setup() calls serial8250_console_setup() before .dev is set to uart_port. Therefore, it will not call pm_runtime_get_sync(). Later, when the actual driver is going to take over univ8250_console_exit() is called. As .dev is already set, serial8250_console_exit() makes pm_runtime_put_sync() call with usage count being zero triggering PM usage count warning (extra debug for univ8250_console_setup(), univ8250_console_exit(), and serial8250_register_ports()): [ 0.068987] univ8250_console_setup ttyS0 nodev [ 0.499670] printk: console [ttyS0] enabled [ 0.717955] printk: console [ttyS0] printing thread started [ 1.960163] serial8250_register_ports assigned dev for ttyS0 [ 1.976830] printk: console [ttyS0] disabled [ 1.976888] printk: console [ttyS0] printing thread stopped [ 1.977073] univ8250_console_exit ttyS0 usage:0 [ 1.977075] serial8250 serial8250: Runtime PM usage count underflow! [ 1.977429] dw-apb-uart.6: ttyS0 at MMIO 0x4010006000 (irq = 33, base_baud = 115200) is a 16550A [ 1.977812] univ8250_console_setup ttyS0 usage:2 [ 1.978167] printk: console [ttyS0] printing thread started [ 1.978203] printk: console [ttyS0] enabled To fix the issue, call pm_runtime_get_sync() in serial8250_register_ports() as soon as .dev is set for an uart_port if it has console enabled. This problem became apparent only recently because 82586a721595 ("PM: runtime: Avoid device usage count underflows") added the warning printout. I confirmed this problem also occurs with v5.18 (w/o the warning printout, obviously). Fixes: bedb404e91bb ("serial: 8250_port: Don't use power management for kernel console") Cc: stable Tested-by: Tony Lindgren Reviewed-by: Andy Shevchenko Reviewed-by: Tony Lindgren Signed-off-by: Ilpo Järvinen Link: https://lore.kernel.org/r/b4f428e9-491f-daf2-2232-819928dc276e@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_core.c | 4 ++++ drivers/tty/serial/serial_core.c | 5 ----- include/linux/serial_core.h | 5 +++++ 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index cfbd2de0ca6e..3f56dbc9432b 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -559,6 +560,9 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) up->port.dev = dev; + if (uart_console_enabled(&up->port)) + pm_runtime_get_sync(up->port.dev); + serial8250_apply_quirks(up); uart_add_one_port(drv, &up->port); } diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 338ebadfd44b..3dc926d6c00a 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -1941,11 +1941,6 @@ static int uart_proc_show(struct seq_file *m, void *v) } #endif -static inline bool uart_console_enabled(struct uart_port *port) -{ - return uart_console(port) && (port->cons->flags & CON_ENABLED); -} - static void uart_port_spin_lock_init(struct uart_port *port) { spin_lock_init(&port->lock); diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 657a0fc68a3f..fde258b3decd 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -390,6 +390,11 @@ static const bool earlycon_acpi_spcr_enable EARLYCON_USED_OR_UNUSED; static inline int setup_earlycon(char *buf) { return 0; } #endif +static inline bool uart_console_enabled(struct uart_port *port) +{ + return uart_console(port) && (port->cons->flags & CON_ENABLED); +} + struct uart_port *uart_get_console(struct uart_port *ports, int nr, struct console *c); int uart_parse_earlycon(char *p, unsigned char *iotype, resource_size_t *addr, -- cgit From eb7f8e28420372787933eec079735c35034bda7d Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 30 Jun 2022 20:32:55 -0600 Subject: misc: rtsx_usb: fix use of dma mapped buffer for usb bulk transfer rtsx_usb driver allocates coherent dma buffer for urb transfers. This buffer is passed to usb_bulk_msg() and usb core tries to map already mapped buffer running into a dma mapping error. xhci_hcd 0000:01:00.0: rejecting DMA map of vmalloc memory WARNING: CPU: 1 PID: 279 at include/linux/dma-mapping.h:326 usb_ hcd_map_urb_for_dma+0x7d6/0x820 ... xhci_map_urb_for_dma+0x291/0x4e0 usb_hcd_submit_urb+0x199/0x12b0 ... usb_submit_urb+0x3b8/0x9e0 usb_start_wait_urb+0xe3/0x2d0 usb_bulk_msg+0x115/0x240 rtsx_usb_transfer_data+0x185/0x1a8 [rtsx_usb] rtsx_usb_send_cmd+0xbb/0x123 [rtsx_usb] rtsx_usb_write_register+0x12c/0x143 [rtsx_usb] rtsx_usb_probe+0x226/0x4b2 [rtsx_usb] Fix it to use kmalloc() to get DMA-able memory region instead. Signed-off-by: Shuah Khan Cc: stable Link: https://lore.kernel.org/r/667d627d502e1ba9ff4f9b94966df3299d2d3c0d.1656642167.git.skhan@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cardreader/rtsx_usb.c | 13 +++++++------ include/linux/rtsx_usb.h | 1 - 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/cardreader/rtsx_usb.c b/drivers/misc/cardreader/rtsx_usb.c index 1ef9b61077c4..e147cc8ab0fd 100644 --- a/drivers/misc/cardreader/rtsx_usb.c +++ b/drivers/misc/cardreader/rtsx_usb.c @@ -631,8 +631,7 @@ static int rtsx_usb_probe(struct usb_interface *intf, ucr->pusb_dev = usb_dev; - ucr->iobuf = usb_alloc_coherent(ucr->pusb_dev, IOBUF_SIZE, - GFP_KERNEL, &ucr->iobuf_dma); + ucr->iobuf = kmalloc(IOBUF_SIZE, GFP_KERNEL); if (!ucr->iobuf) return -ENOMEM; @@ -668,8 +667,9 @@ static int rtsx_usb_probe(struct usb_interface *intf, out_init_fail: usb_set_intfdata(ucr->pusb_intf, NULL); - usb_free_coherent(ucr->pusb_dev, IOBUF_SIZE, ucr->iobuf, - ucr->iobuf_dma); + kfree(ucr->iobuf); + ucr->iobuf = NULL; + ucr->cmd_buf = ucr->rsp_buf = NULL; return ret; } @@ -682,8 +682,9 @@ static void rtsx_usb_disconnect(struct usb_interface *intf) mfd_remove_devices(&intf->dev); usb_set_intfdata(ucr->pusb_intf, NULL); - usb_free_coherent(ucr->pusb_dev, IOBUF_SIZE, ucr->iobuf, - ucr->iobuf_dma); + kfree(ucr->iobuf); + ucr->iobuf = NULL; + ucr->cmd_buf = ucr->rsp_buf = NULL; } #ifdef CONFIG_PM diff --git a/include/linux/rtsx_usb.h b/include/linux/rtsx_usb.h index 159729cffd8e..a07f7341ebc2 100644 --- a/include/linux/rtsx_usb.h +++ b/include/linux/rtsx_usb.h @@ -55,7 +55,6 @@ struct rtsx_ucr { struct usb_interface *pusb_intf; struct usb_sg_request current_sg; unsigned char *iobuf; - dma_addr_t iobuf_dma; struct timer_list sg_timer; struct mutex dev_mutex; -- cgit From 3776c78559853fd151be7c41e369fd076fb679d5 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Thu, 30 Jun 2022 20:32:56 -0600 Subject: misc: rtsx_usb: use separate command and response buffers rtsx_usb uses same buffer for command and response. There could be a potential conflict using the same buffer for both especially if retries and timeouts are involved. Use separate command and response buffers to avoid conflicts. Signed-off-by: Shuah Khan Cc: stable Link: https://lore.kernel.org/r/07e3721804ff07aaab9ef5b39a5691d0718b9ade.1656642167.git.skhan@linuxfoundation.org Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cardreader/rtsx_usb.c | 26 +++++++++++++++++--------- include/linux/rtsx_usb.h | 1 - 2 files changed, 17 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/misc/cardreader/rtsx_usb.c b/drivers/misc/cardreader/rtsx_usb.c index e147cc8ab0fd..4e2108052509 100644 --- a/drivers/misc/cardreader/rtsx_usb.c +++ b/drivers/misc/cardreader/rtsx_usb.c @@ -631,15 +631,18 @@ static int rtsx_usb_probe(struct usb_interface *intf, ucr->pusb_dev = usb_dev; - ucr->iobuf = kmalloc(IOBUF_SIZE, GFP_KERNEL); - if (!ucr->iobuf) + ucr->cmd_buf = kmalloc(IOBUF_SIZE, GFP_KERNEL); + if (!ucr->cmd_buf) return -ENOMEM; + ucr->rsp_buf = kmalloc(IOBUF_SIZE, GFP_KERNEL); + if (!ucr->rsp_buf) + goto out_free_cmd_buf; + usb_set_intfdata(intf, ucr); ucr->vendor_id = id->idVendor; ucr->product_id = id->idProduct; - ucr->cmd_buf = ucr->rsp_buf = ucr->iobuf; mutex_init(&ucr->dev_mutex); @@ -667,9 +670,11 @@ static int rtsx_usb_probe(struct usb_interface *intf, out_init_fail: usb_set_intfdata(ucr->pusb_intf, NULL); - kfree(ucr->iobuf); - ucr->iobuf = NULL; - ucr->cmd_buf = ucr->rsp_buf = NULL; + kfree(ucr->rsp_buf); + ucr->rsp_buf = NULL; +out_free_cmd_buf: + kfree(ucr->cmd_buf); + ucr->cmd_buf = NULL; return ret; } @@ -682,9 +687,12 @@ static void rtsx_usb_disconnect(struct usb_interface *intf) mfd_remove_devices(&intf->dev); usb_set_intfdata(ucr->pusb_intf, NULL); - kfree(ucr->iobuf); - ucr->iobuf = NULL; - ucr->cmd_buf = ucr->rsp_buf = NULL; + + kfree(ucr->cmd_buf); + ucr->cmd_buf = NULL; + + kfree(ucr->rsp_buf); + ucr->rsp_buf = NULL; } #ifdef CONFIG_PM diff --git a/include/linux/rtsx_usb.h b/include/linux/rtsx_usb.h index a07f7341ebc2..3247ed8e9ff0 100644 --- a/include/linux/rtsx_usb.h +++ b/include/linux/rtsx_usb.h @@ -54,7 +54,6 @@ struct rtsx_ucr { struct usb_device *pusb_dev; struct usb_interface *pusb_intf; struct usb_sg_request current_sg; - unsigned char *iobuf; struct timer_list sg_timer; struct mutex dev_mutex; -- cgit From 07358194badf73e267289b40b761f5dc56928eab Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 27 Jun 2022 20:42:18 +0200 Subject: PM: runtime: Redefine pm_runtime_release_supplier() Instead of passing an extra bool argument to pm_runtime_release_supplier(), make its callers take care of triggering a runtime-suspend of the supplier device as needed. No expected functional impact. Suggested-by: Greg Kroah-Hartman Signed-off-by: Rafael J. Wysocki Reviewed-by: Greg Kroah-Hartman Cc: 5.1+ # 5.1+ --- drivers/base/core.c | 3 ++- drivers/base/power/runtime.c | 20 +++++++++----------- include/linux/pm_runtime.h | 5 ++--- 3 files changed, 13 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index 7cd789c4985d..58aa49527d3a 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -486,7 +486,8 @@ static void device_link_release_fn(struct work_struct *work) /* Ensure that all references to the link object have been dropped. */ device_link_synchronize_removal(); - pm_runtime_release_supplier(link, true); + pm_runtime_release_supplier(link); + pm_request_idle(link->supplier); put_device(link->consumer); put_device(link->supplier); diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 676dc72d912d..23cc4c377d77 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -308,13 +308,10 @@ static int rpm_get_suppliers(struct device *dev) /** * pm_runtime_release_supplier - Drop references to device link's supplier. * @link: Target device link. - * @check_idle: Whether or not to check if the supplier device is idle. * - * Drop all runtime PM references associated with @link to its supplier device - * and if @check_idle is set, check if that device is idle (and so it can be - * suspended). + * Drop all runtime PM references associated with @link to its supplier device. */ -void pm_runtime_release_supplier(struct device_link *link, bool check_idle) +void pm_runtime_release_supplier(struct device_link *link) { struct device *supplier = link->supplier; @@ -327,9 +324,6 @@ void pm_runtime_release_supplier(struct device_link *link, bool check_idle) while (refcount_dec_not_one(&link->rpm_active) && atomic_read(&supplier->power.usage_count) > 0) pm_runtime_put_noidle(supplier); - - if (check_idle) - pm_request_idle(supplier); } static void __rpm_put_suppliers(struct device *dev, bool try_to_suspend) @@ -337,8 +331,11 @@ static void __rpm_put_suppliers(struct device *dev, bool try_to_suspend) struct device_link *link; list_for_each_entry_rcu(link, &dev->links.suppliers, c_node, - device_links_read_lock_held()) - pm_runtime_release_supplier(link, try_to_suspend); + device_links_read_lock_held()) { + pm_runtime_release_supplier(link); + if (try_to_suspend) + pm_request_idle(link->supplier); + } } static void rpm_put_suppliers(struct device *dev) @@ -1838,7 +1835,8 @@ void pm_runtime_drop_link(struct device_link *link) return; pm_runtime_drop_link_count(link->consumer); - pm_runtime_release_supplier(link, true); + pm_runtime_release_supplier(link); + pm_request_idle(link->supplier); } static bool pm_runtime_need_not_resume(struct device *dev) diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 9e4d056967c6..0a41b2dcccad 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -88,7 +88,7 @@ extern void pm_runtime_get_suppliers(struct device *dev); extern void pm_runtime_put_suppliers(struct device *dev); extern void pm_runtime_new_link(struct device *dev); extern void pm_runtime_drop_link(struct device_link *link); -extern void pm_runtime_release_supplier(struct device_link *link, bool check_idle); +extern void pm_runtime_release_supplier(struct device_link *link); extern int devm_pm_runtime_enable(struct device *dev); @@ -314,8 +314,7 @@ static inline void pm_runtime_get_suppliers(struct device *dev) {} static inline void pm_runtime_put_suppliers(struct device *dev) {} static inline void pm_runtime_new_link(struct device *dev) {} static inline void pm_runtime_drop_link(struct device_link *link) {} -static inline void pm_runtime_release_supplier(struct device_link *link, - bool check_idle) {} +static inline void pm_runtime_release_supplier(struct device_link *link) {} #endif /* !CONFIG_PM */ -- cgit From 4a557a5d1a6145ea586dc9b17a9b4e5190c9c017 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 30 Jun 2022 09:34:10 -0700 Subject: sparse: introduce conditional lock acquire function attribute The kernel tends to try to avoid conditional locking semantics because it makes it harder to think about and statically check locking rules, but we do have a few fundamental locking primitives that take locks conditionally - most obviously the 'trylock' functions. That has always been a problem for 'sparse' checking for locking imbalance, and we've had a special '__cond_lock()' macro that we've used to let sparse know how the locking works: # define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0) so that you can then use this to tell sparse that (for example) the spinlock trylock macro ends up acquiring the lock when it succeeds, but not when it fails: #define raw_spin_trylock(lock) __cond_lock(lock, _raw_spin_trylock(lock)) and then sparse can follow along the locking rules when you have code like if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; .. sparse sees that the lock is held here.. spin_unlock(&dentry->d_lock); and sparse ends up happy about the lock contexts. However, this '__cond_lock()' use does result in very ugly header files, and requires you to basically wrap the real function with that macro that uses '__cond_lock'. Which has made PeterZ NAK things that try to fix sparse warnings over the years [1]. To solve this, there is now a very experimental patch to sparse that basically does the exact same thing as '__cond_lock()' did, but using a function attribute instead. That seems to make PeterZ happy [2]. Note that this does not replace existing use of '__cond_lock()', but only exposes the new proposed attribute and uses it for the previously unannotated 'refcount_dec_and_lock()' family of functions. For existing sparse installations, this will make no difference (a negative output context was ignored), but if you have the experimental sparse patch it will make sparse now understand code that uses those functions, the same way '__cond_lock()' makes sparse understand the very similar 'atomic_dec_and_lock()' uses that have the old '__cond_lock()' annotations. Note that in some cases this will silence existing context imbalance warnings. But in other cases it may end up exposing new sparse warnings for code that sparse just didn't see the locking for at all before. This is a trial, in other words. I'd expect that if it ends up being successful, and new sparse releases end up having this new attribute, we'll migrate the old-style '__cond_lock()' users to use the new-style '__cond_acquires' function attribute. The actual experimental sparse patch was posted in [3]. Link: https://lore.kernel.org/all/20130930134434.GC12926@twins.programming.kicks-ass.net/ [1] Link: https://lore.kernel.org/all/Yr60tWxN4P568x3W@worktop.programming.kicks-ass.net/ [2] Link: https://lore.kernel.org/all/CAHk-=wjZfO9hGqJ2_hGQG3U_XzSh9_XaXze=HgPdvJbgrvASfA@mail.gmail.com/ [3] Acked-by: Peter Zijlstra Cc: Alexander Aring Cc: Luc Van Oostenryck Signed-off-by: Linus Torvalds --- include/linux/compiler_types.h | 2 ++ include/linux/refcount.h | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index d08dfcb0ac68..4f2a819fd60a 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -24,6 +24,7 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } /* context/locking */ # define __must_hold(x) __attribute__((context(x,1,1))) # define __acquires(x) __attribute__((context(x,0,1))) +# define __cond_acquires(x) __attribute__((context(x,0,-1))) # define __releases(x) __attribute__((context(x,1,0))) # define __acquire(x) __context__(x,1) # define __release(x) __context__(x,-1) @@ -50,6 +51,7 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } /* context/locking */ # define __must_hold(x) # define __acquires(x) +# define __cond_acquires(x) # define __releases(x) # define __acquire(x) (void)0 # define __release(x) (void)0 diff --git a/include/linux/refcount.h b/include/linux/refcount.h index b8a6e387f8f9..a62fcca97486 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -361,9 +361,9 @@ static inline void refcount_dec(refcount_t *r) extern __must_check bool refcount_dec_if_one(refcount_t *r); extern __must_check bool refcount_dec_not_one(refcount_t *r); -extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock); -extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock); +extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) __cond_acquires(lock); +extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) __cond_acquires(lock); extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r, spinlock_t *lock, - unsigned long *flags); + unsigned long *flags) __cond_acquires(lock); #endif /* _LINUX_REFCOUNT_H */ -- cgit From b8d5109f50969ead9d49c3e8bd78ec1f82e548e3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 3 Jul 2022 14:40:28 -0700 Subject: lockref: remove unused 'lockref_get_or_lock()' function Looking at the conditional lock acquire functions in the kernel due to the new sparse support (see commit 4a557a5d1a61 "sparse: introduce conditional lock acquire function attribute"), it became obvious that the lockref code has a couple of them, but they don't match the usual naming convention for the other ones, and their return value logic is also reversed. In the other very similar places, the naming pattern is '*_and_lock()' (eg 'atomic_put_and_lock()' and 'refcount_dec_and_lock()'), and the function returns true when the lock is taken. The lockref code is superficially very similar to the refcount code, only with the special "atomic wrt the embedded lock" semantics. But instead of the '*_and_lock()' naming it uses '*_or_lock()'. And instead of returning true in case it took the lock, it returns true if it *didn't* take the lock. Now, arguably the reflock code is quite logical: it really is a "either decrement _or_ lock" kind of situation - and the return value is about whether the operation succeeded without any special care needed. So despite the similarities, the differences do make some sense, and maybe it's not worth trying to unify the different conditional locking primitives in this area. But while looking at this all, it did become obvious that the 'lockref_get_or_lock()' function hasn't actually had any users for almost a decade. The only user it ever had was the shortlived 'd_rcu_to_refcount()' function, and it got removed and replaced with 'lockref_get_not_dead()' back in 2013 in commits 0d98439ea3c6 ("vfs: use lockred 'dead' flag to mark unrecoverably dead dentries") and e5c832d55588 ("vfs: fix dentry RCU to refcounting possibly sleeping dput()") In fact, that single use was removed less than a week after the whole function was introduced in commit b3abd80250c1 ("lockref: add 'lockref_get_or_lock() helper") so this function has been around for a decade, but only had a user for six days. Let's just put this mis-designed and unused function out of its misery. We can think about the naming and semantic oddities of the remaining 'lockref_put_or_lock()' later, but at least that function has users. And while the naming is different and the return value doesn't match, that function matches the whole '{atomic,refcount}_dec_and_test()' pattern much better (ie the magic happens when the count goes down to zero, not when it is incremented from zero). Signed-off-by: Linus Torvalds --- include/linux/lockref.h | 1 - lib/lockref.c | 25 ------------------------- 2 files changed, 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockref.h b/include/linux/lockref.h index 99f17cc8e163..c3a1f78bc884 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -38,7 +38,6 @@ extern void lockref_get(struct lockref *); extern int lockref_put_return(struct lockref *); extern int lockref_get_not_zero(struct lockref *); extern int lockref_put_not_zero(struct lockref *); -extern int lockref_get_or_lock(struct lockref *); extern int lockref_put_or_lock(struct lockref *); extern void lockref_mark_dead(struct lockref *); diff --git a/lib/lockref.c b/lib/lockref.c index c6f0b183b937..45e93ece8ba0 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -110,31 +110,6 @@ int lockref_put_not_zero(struct lockref *lockref) } EXPORT_SYMBOL(lockref_put_not_zero); -/** - * lockref_get_or_lock - Increments count unless the count is 0 or dead - * @lockref: pointer to lockref structure - * Return: 1 if count updated successfully or 0 if count was zero - * and we got the lock instead. - */ -int lockref_get_or_lock(struct lockref *lockref) -{ - CMPXCHG_LOOP( - new.count++; - if (old.count <= 0) - break; - , - return 1; - ); - - spin_lock(&lockref->lock); - if (lockref->count <= 0) - return 0; - lockref->count++; - spin_unlock(&lockref->lock); - return 1; -} -EXPORT_SYMBOL(lockref_get_or_lock); - /** * lockref_put_return - Decrement reference count if possible * @lockref: pointer to lockref structure -- cgit From cffe57bee62b155c08d71218fc0e9e84a0a90bbb Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Wed, 22 Jun 2022 15:45:46 +0700 Subject: Documentation: highmem: use literal block for code example in highmem.h comment When building htmldocs on Linus's tree, there are inline emphasis warnings on include/linux/highmem.h: Documentation/vm/highmem:166: ./include/linux/highmem.h:154: WARNING: Inline emphasis start-string without end-string. Documentation/vm/highmem:166: ./include/linux/highmem.h:157: WARNING: Inline emphasis start-string without end-string. These warnings above are due to comments in code example at the mentioned lines above are enclosed by double dash (--), which confuses Sphinx as inline markup delimiters instead. Fix these warnings by indenting the code example with literal block indentation and making the comments C comments. Link: https://lkml.kernel.org/r/20220622084546.17745-1-bagasdotme@gmail.com Fixes: 85a85e7601263f ("Documentation/vm: move "Using kmap-atomic" to highmem.h") Signed-off-by: Bagas Sanjaya Reviewed-by: Ira Weiny Tested-by: Ira Weiny Cc: "Matthew Wilcox (Oracle)" Cc: "Fabio M. De Francesco" Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/highmem.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 3af34de54330..56d6a0196534 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -149,19 +149,19 @@ static inline void *kmap_local_folio(struct folio *folio, size_t offset); * It is used in atomic context when code wants to access the contents of a * page that might be allocated from high memory (see __GFP_HIGHMEM), for * example a page in the pagecache. The API has two functions, and they - * can be used in a manner similar to the following: + * can be used in a manner similar to the following:: * - * -- Find the page of interest. -- - * struct page *page = find_get_page(mapping, offset); + * // Find the page of interest. + * struct page *page = find_get_page(mapping, offset); * - * -- Gain access to the contents of that page. -- - * void *vaddr = kmap_atomic(page); + * // Gain access to the contents of that page. + * void *vaddr = kmap_atomic(page); * - * -- Do something to the contents of that page. -- - * memset(vaddr, 0, PAGE_SIZE); + * // Do something to the contents of that page. + * memset(vaddr, 0, PAGE_SIZE); * - * -- Unmap that page. -- - * kunmap_atomic(vaddr); + * // Unmap that page. + * kunmap_atomic(vaddr); * * Note that the kunmap_atomic() call takes the result of the kmap_atomic() * call, not the argument. -- cgit From 85e4ea1049c70fb99de5c6057e835d151fb647da Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 20 Apr 2022 14:27:17 +0100 Subject: fscache: Fix invalidation/lookup race If an NFS file is opened for writing and closed, fscache_invalidate() will be asked to invalidate the file - however, if the cookie is in the LOOKING_UP state (or the CREATING state), then request to invalidate doesn't get recorded for fscache_cookie_state_machine() to do something with. Fix this by making __fscache_invalidate() set a flag if it sees the cookie is in the LOOKING_UP state to indicate that we need to go to invalidation. Note that this requires a count on the n_accesses counter for the state machine, which that will release when it's done. fscache_cookie_state_machine() then shifts to the INVALIDATING state if it sees the flag. Without this, an nfs file can get corrupted if it gets modified locally and then read locally as the cache contents may not get updated. Fixes: d24af13e2e23 ("fscache: Implement cookie invalidation") Reported-by: Max Kellermann Signed-off-by: David Howells Tested-by: Max Kellermann Link: https://lore.kernel.org/r/YlWWbpW5Foynjllo@rabbit.intern.cm-ag [1] --- fs/fscache/cookie.c | 15 ++++++++++++++- include/linux/fscache.h | 1 + 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 575a5ba07c0d..74920826d8f6 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -522,7 +522,14 @@ static void fscache_perform_lookup(struct fscache_cookie *cookie) } fscache_see_cookie(cookie, fscache_cookie_see_active); - fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE); + spin_lock(&cookie->lock); + if (test_and_clear_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags)) + __fscache_set_cookie_state(cookie, + FSCACHE_COOKIE_STATE_INVALIDATING); + else + __fscache_set_cookie_state(cookie, FSCACHE_COOKIE_STATE_ACTIVE); + spin_unlock(&cookie->lock); + wake_up_cookie_state(cookie); trace = fscache_access_lookup_cookie_end; out: @@ -757,6 +764,9 @@ again_locked: spin_lock(&cookie->lock); } + if (test_and_clear_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags)) + fscache_end_cookie_access(cookie, fscache_access_invalidate_cookie_end); + switch (state) { case FSCACHE_COOKIE_STATE_RELINQUISHING: fscache_see_cookie(cookie, fscache_cookie_see_relinquish); @@ -1053,6 +1063,9 @@ void __fscache_invalidate(struct fscache_cookie *cookie, return; case FSCACHE_COOKIE_STATE_LOOKING_UP: + __fscache_begin_cookie_access(cookie, fscache_access_invalidate_cookie); + set_bit(FSCACHE_COOKIE_DO_INVALIDATE, &cookie->flags); + fallthrough; case FSCACHE_COOKIE_STATE_CREATING: spin_unlock(&cookie->lock); _leave(" [look %x]", cookie->inval_counter); diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 72585c9729a2..b86265664879 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -130,6 +130,7 @@ struct fscache_cookie { #define FSCACHE_COOKIE_DO_PREP_TO_WRITE 12 /* T if cookie needs write preparation */ #define FSCACHE_COOKIE_HAVE_DATA 13 /* T if this cookie has data stored */ #define FSCACHE_COOKIE_IS_HASHED 14 /* T if this cookie is hashed */ +#define FSCACHE_COOKIE_DO_INVALIDATE 15 /* T if cookie needs invalidation */ enum fscache_cookie_state state; u8 advice; /* FSCACHE_ADV_* */ -- cgit From 7feec7430edddb87c24b0a86b08a03d0b496a755 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 5 Jul 2022 13:29:14 -0500 Subject: ACPI: CPPC: Only probe for _CPC if CPPC v2 is acked Previously the kernel used to ignore whether the firmware masked CPPC or CPPCv2 and would just pretend that it worked. When support for the USB4 bit in _OSC was introduced from commit 9e1f561afb ("ACPI: Execute platform _OSC also with query bit clear") the kernel began to look at the return when the query bit was clear. This caused regressions that were misdiagnosed and attempted to be solved as part of commit 2ca8e6285250 ("Revert "ACPI: Pass the same capabilities to the _OSC regardless of the query flag""). This caused a different regression where non-Intel systems weren't able to negotiate _OSC properly. This was reverted in commit 2ca8e6285250 ("Revert "ACPI: Pass the same capabilities to the _OSC regardless of the query flag"") and attempted to be fixed by commit c42fa24b4475 ("ACPI: bus: Avoid using CPPC if not supported by firmware") but the regression still returned. These systems with the regression only load support for CPPC from an SSDT dynamically when _OSC reports CPPC v2. Avoid the problem by not letting CPPC satisfy the requirement in `acpi_cppc_processor_probe`. Reported-by: CUI Hao Reported-by: maxim.novozhilov@gmail.com Reported-by: lethe.tree@protonmail.com Reported-by: garystephenwright@gmail.com Reported-by: galaxyking0419@gmail.com Fixes: c42fa24b4475 ("ACPI: bus: Avoid using CPPC if not supported by firmware") Fixes: 2ca8e6285250 ("Revert "ACPI Pass the same capabilities to the _OSC regardless of the query flag"") Link: https://bugzilla.kernel.org/show_bug.cgi?id=213023 Link: https://bugzilla.redhat.com/show_bug.cgi?id=2075387 Reviewed-by: Mika Westerberg Tested-by: CUI Hao Signed-off-by: Mario Limonciello Signed-off-by: Rafael J. Wysocki --- drivers/acpi/bus.c | 11 +++++------ drivers/acpi/cppc_acpi.c | 4 +++- include/linux/acpi.h | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 86fa61a21826..e2db1bdd9dd2 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -298,7 +298,7 @@ EXPORT_SYMBOL_GPL(osc_cpc_flexible_adr_space_confirmed); bool osc_sb_native_usb4_support_confirmed; EXPORT_SYMBOL_GPL(osc_sb_native_usb4_support_confirmed); -bool osc_sb_cppc_not_supported; +bool osc_sb_cppc2_support_acked; static u8 sb_uuid_str[] = "0811B06E-4A27-44F9-8D60-3CBBC22E7B48"; static void acpi_bus_osc_negotiate_platform_control(void) @@ -358,11 +358,6 @@ static void acpi_bus_osc_negotiate_platform_control(void) return; } -#ifdef CONFIG_ACPI_CPPC_LIB - osc_sb_cppc_not_supported = !(capbuf_ret[OSC_SUPPORT_DWORD] & - (OSC_SB_CPC_SUPPORT | OSC_SB_CPCV2_SUPPORT)); -#endif - /* * Now run _OSC again with query flag clear and with the caps * supported by both the OS and the platform. @@ -376,6 +371,10 @@ static void acpi_bus_osc_negotiate_platform_control(void) capbuf_ret = context.ret.pointer; if (context.ret.length > OSC_SUPPORT_DWORD) { +#ifdef CONFIG_ACPI_CPPC_LIB + osc_sb_cppc2_support_acked = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_CPCV2_SUPPORT; +#endif + osc_sb_apei_support_acked = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_APEI_SUPPORT; osc_pc_lpi_support_confirmed = diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 903528f7e187..d64facbda0fb 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -684,8 +684,10 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) acpi_status status; int ret = -ENODATA; - if (osc_sb_cppc_not_supported) + if (!osc_sb_cppc2_support_acked) { + pr_debug("CPPC v2 _OSC not acked\n"); return -ENODEV; + } /* Parse the ACPI _CPC table for this CPU. */ status = acpi_evaluate_object_typed(handle, "_CPC", NULL, &output, diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4f82a5bc6d98..44975c1bbe12 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -584,7 +584,7 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context); extern bool osc_sb_apei_support_acked; extern bool osc_pc_lpi_support_confirmed; extern bool osc_sb_native_usb4_support_confirmed; -extern bool osc_sb_cppc_not_supported; +extern bool osc_sb_cppc2_support_acked; extern bool osc_cpc_flexible_adr_space_confirmed; /* USB4 Capabilities */ -- cgit From 4140d77a022101376bbfa3ec3e3da5063455c60e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Sat, 25 Jun 2022 21:34:30 +0800 Subject: iommu/vt-d: Fix RID2PASID setup/teardown failure The IOMMU driver shares the pasid table for PCI alias devices. When the RID2PASID entry of the shared pasid table has been filled by the first device, the subsequent device will encounter the "DMAR: Setup RID2PASID failed" failure as the pasid entry has already been marked as present. As the result, the IOMMU probing process will be aborted. On the contrary, when any alias device is hot-removed from the system, for example, by writing to /sys/bus/pci/devices/.../remove, the shared RID2PASID will be cleared without any notifications to other devices. As the result, any DMAs from those rest devices are blocked. Sharing pasid table among PCI alias devices could save two memory pages for devices underneath the PCIe-to-PCI bridges. Anyway, considering that those devices are rare on modern platforms that support VT-d in scalable mode and the saved memory is negligible, it's reasonable to remove this part of immature code to make the driver feasible and stable. Fixes: ef848b7e5a6a0 ("iommu/vt-d: Setup pasid entry for RID2PASID support") Reported-by: Chenyi Qiang Reported-by: Ethan Zhao Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Ethan Zhao Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220623065720.727849-1-baolu.lu@linux.intel.com Link: https://lore.kernel.org/r/20220625133430.2200315-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 24 ---------------- drivers/iommu/intel/pasid.c | 69 ++------------------------------------------- drivers/iommu/intel/pasid.h | 1 - include/linux/intel-iommu.h | 3 -- 4 files changed, 3 insertions(+), 94 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 44016594831d..5c0dce78586a 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -320,30 +320,6 @@ EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped); DEFINE_SPINLOCK(device_domain_lock); static LIST_HEAD(device_domain_list); -/* - * Iterate over elements in device_domain_list and call the specified - * callback @fn against each element. - */ -int for_each_device_domain(int (*fn)(struct device_domain_info *info, - void *data), void *data) -{ - int ret = 0; - unsigned long flags; - struct device_domain_info *info; - - spin_lock_irqsave(&device_domain_lock, flags); - list_for_each_entry(info, &device_domain_list, global) { - ret = fn(info, data); - if (ret) { - spin_unlock_irqrestore(&device_domain_lock, flags); - return ret; - } - } - spin_unlock_irqrestore(&device_domain_lock, flags); - - return 0; -} - const struct iommu_ops intel_iommu_ops; static bool translation_pre_enabled(struct intel_iommu *iommu) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index cb4c1d0cf25c..17cad7c1f62d 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -86,54 +86,6 @@ void vcmd_free_pasid(struct intel_iommu *iommu, u32 pasid) /* * Per device pasid table management: */ -static inline void -device_attach_pasid_table(struct device_domain_info *info, - struct pasid_table *pasid_table) -{ - info->pasid_table = pasid_table; - list_add(&info->table, &pasid_table->dev); -} - -static inline void -device_detach_pasid_table(struct device_domain_info *info, - struct pasid_table *pasid_table) -{ - info->pasid_table = NULL; - list_del(&info->table); -} - -struct pasid_table_opaque { - struct pasid_table **pasid_table; - int segment; - int bus; - int devfn; -}; - -static int search_pasid_table(struct device_domain_info *info, void *opaque) -{ - struct pasid_table_opaque *data = opaque; - - if (info->iommu->segment == data->segment && - info->bus == data->bus && - info->devfn == data->devfn && - info->pasid_table) { - *data->pasid_table = info->pasid_table; - return 1; - } - - return 0; -} - -static int get_alias_pasid_table(struct pci_dev *pdev, u16 alias, void *opaque) -{ - struct pasid_table_opaque *data = opaque; - - data->segment = pci_domain_nr(pdev->bus); - data->bus = PCI_BUS_NUM(alias); - data->devfn = alias & 0xff; - - return for_each_device_domain(&search_pasid_table, data); -} /* * Allocate a pasid table for @dev. It should be called in a @@ -143,28 +95,18 @@ int intel_pasid_alloc_table(struct device *dev) { struct device_domain_info *info; struct pasid_table *pasid_table; - struct pasid_table_opaque data; struct page *pages; u32 max_pasid = 0; - int ret, order; - int size; + int order, size; might_sleep(); info = dev_iommu_priv_get(dev); if (WARN_ON(!info || !dev_is_pci(dev) || info->pasid_table)) return -EINVAL; - /* DMA alias device already has a pasid table, use it: */ - data.pasid_table = &pasid_table; - ret = pci_for_each_dma_alias(to_pci_dev(dev), - &get_alias_pasid_table, &data); - if (ret) - goto attach_out; - pasid_table = kzalloc(sizeof(*pasid_table), GFP_KERNEL); if (!pasid_table) return -ENOMEM; - INIT_LIST_HEAD(&pasid_table->dev); if (info->pasid_supported) max_pasid = min_t(u32, pci_max_pasids(to_pci_dev(dev)), @@ -182,9 +124,7 @@ int intel_pasid_alloc_table(struct device *dev) pasid_table->table = page_address(pages); pasid_table->order = order; pasid_table->max_pasid = 1 << (order + PAGE_SHIFT + 3); - -attach_out: - device_attach_pasid_table(info, pasid_table); + info->pasid_table = pasid_table; return 0; } @@ -202,10 +142,7 @@ void intel_pasid_free_table(struct device *dev) return; pasid_table = info->pasid_table; - device_detach_pasid_table(info, pasid_table); - - if (!list_empty(&pasid_table->dev)) - return; + info->pasid_table = NULL; /* Free scalable mode PASID directory tables: */ dir = pasid_table->table; diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 583ea67fc783..bf5b937848b4 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -74,7 +74,6 @@ struct pasid_table { void *table; /* pasid table pointer */ int order; /* page order of pasid table */ u32 max_pasid; /* max pasid */ - struct list_head dev; /* device list */ }; /* Get PRESENT bit of a PASID directory entry. */ diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 4f29139bbfc3..5fcf89faa31a 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -612,7 +612,6 @@ struct intel_iommu { struct device_domain_info { struct list_head link; /* link to domain siblings */ struct list_head global; /* link to global list */ - struct list_head table; /* link to pasid table */ u32 segment; /* PCI segment number */ u8 bus; /* PCI bus number */ u8 devfn; /* PCI devfn number */ @@ -729,8 +728,6 @@ extern int dmar_ir_support(void); void *alloc_pgtable_page(int node); void free_pgtable_page(void *vaddr); struct intel_iommu *domain_get_iommu(struct dmar_domain *domain); -int for_each_device_domain(int (*fn)(struct device_domain_info *info, - void *data), void *data); void iommu_flush_write_buffer(struct intel_iommu *iommu); int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev); struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn); -- cgit From 5c629dc9609dc43492a7bc8060cc6120875bf096 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 6 Jul 2022 10:05:05 -0700 Subject: nvme: use struct group for generic command dwords This will allow the trace event to know the full size of the data intended to be copied and silence read overflow checks. Reported-by: John Garry Suggested-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/trace.h | 2 +- include/linux/nvme.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h index b5f85259461a..37c7f4c89f92 100644 --- a/drivers/nvme/host/trace.h +++ b/drivers/nvme/host/trace.h @@ -69,7 +69,7 @@ TRACE_EVENT(nvme_setup_cmd, __entry->metadata = !!blk_integrity_rq(req); __entry->fctype = cmd->fabrics.fctype; __assign_disk_name(__entry->disk, req->q->disk); - memcpy(__entry->cdw10, &cmd->common.cdw10, + memcpy(__entry->cdw10, &cmd->common.cdws, sizeof(__entry->cdw10)); ), TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%x, cmd=(%s %s)", diff --git a/include/linux/nvme.h b/include/linux/nvme.h index e3934003f239..07cfc922f8e4 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -906,12 +906,14 @@ struct nvme_common_command { __le32 cdw2[2]; __le64 metadata; union nvme_data_ptr dptr; + struct_group(cdws, __le32 cdw10; __le32 cdw11; __le32 cdw12; __le32 cdw13; __le32 cdw14; __le32 cdw15; + ); }; struct nvme_rw_command { -- cgit From e64242caef18b4a5840b0e7a9bff37abd4f4f933 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 25 Jun 2022 13:00:34 +0200 Subject: fbcon: Prevent that screen size is smaller than font size We need to prevent that users configure a screen size which is smaller than the currently selected font size. Otherwise rendering chars on the screen will access memory outside the graphics memory region. This patch adds a new function fbcon_modechange_possible() which implements this check and which later may be extended with other checks if necessary. The new function is called from the FBIOPUT_VSCREENINFO ioctl handler in fbmem.c, which will return -EINVAL if userspace asked for a too small screen size. Signed-off-by: Helge Deller Reviewed-by: Geert Uytterhoeven Cc: stable@vger.kernel.org # v5.4+ --- drivers/video/fbdev/core/fbcon.c | 28 ++++++++++++++++++++++++++++ drivers/video/fbdev/core/fbmem.c | 4 +++- include/linux/fbcon.h | 4 ++++ 3 files changed, 35 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index a33532564393..5632870a9aeb 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -2736,6 +2736,34 @@ void fbcon_update_vcs(struct fb_info *info, bool all) } EXPORT_SYMBOL(fbcon_update_vcs); +/* let fbcon check if it supports a new screen resolution */ +int fbcon_modechange_possible(struct fb_info *info, struct fb_var_screeninfo *var) +{ + struct fbcon_ops *ops = info->fbcon_par; + struct vc_data *vc; + unsigned int i; + + WARN_CONSOLE_UNLOCKED(); + + if (!ops) + return 0; + + /* prevent setting a screen size which is smaller than font size */ + for (i = first_fb_vc; i <= last_fb_vc; i++) { + vc = vc_cons[i].d; + if (!vc || vc->vc_mode != KD_TEXT || + registered_fb[con2fb_map[i]] != info) + continue; + + if (vc->vc_font.width > FBCON_SWAP(var->rotate, var->xres, var->yres) || + vc->vc_font.height > FBCON_SWAP(var->rotate, var->yres, var->xres)) + return -EINVAL; + } + + return 0; +} +EXPORT_SYMBOL_GPL(fbcon_modechange_possible); + int fbcon_mode_deleted(struct fb_info *info, struct fb_videomode *mode) { diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index ee63b5b47290..7dc6848a96bb 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -1107,7 +1107,9 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd, return -EFAULT; console_lock(); lock_fb_info(info); - ret = fb_set_var(info, &var); + ret = fbcon_modechange_possible(info, &var); + if (!ret) + ret = fb_set_var(info, &var); if (!ret) fbcon_update_vcs(info, var.activate & FB_ACTIVATE_ALL); unlock_fb_info(info); diff --git a/include/linux/fbcon.h b/include/linux/fbcon.h index ff5596dd30f8..2382dec6d6ab 100644 --- a/include/linux/fbcon.h +++ b/include/linux/fbcon.h @@ -15,6 +15,8 @@ void fbcon_new_modelist(struct fb_info *info); void fbcon_get_requirement(struct fb_info *info, struct fb_blit_caps *caps); void fbcon_fb_blanked(struct fb_info *info, int blank); +int fbcon_modechange_possible(struct fb_info *info, + struct fb_var_screeninfo *var); void fbcon_update_vcs(struct fb_info *info, bool all); void fbcon_remap_all(struct fb_info *info); int fbcon_set_con2fb_map_ioctl(void __user *argp); @@ -33,6 +35,8 @@ static inline void fbcon_new_modelist(struct fb_info *info) {} static inline void fbcon_get_requirement(struct fb_info *info, struct fb_blit_caps *caps) {} static inline void fbcon_fb_blanked(struct fb_info *info, int blank) {} +static inline int fbcon_modechange_possible(struct fb_info *info, + struct fb_var_screeninfo *var) { return 0; } static inline void fbcon_update_vcs(struct fb_info *info, bool all) {} static inline void fbcon_remap_all(struct fb_info *info) {} static inline int fbcon_set_con2fb_map_ioctl(void __user *argp) { return 0; } -- cgit From d5b36a4dbd06c5e8e36ca8ccc552f679069e2946 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 11 Jul 2022 18:16:25 +0200 Subject: fix race between exit_itimers() and /proc/pid/timers As Chris explains, the comment above exit_itimers() is not correct, we can race with proc_timers_seq_ops. Change exit_itimers() to clear signal->posix_timers with ->siglock held. Cc: Reported-by: chris@accessvector.net Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/linux/sched/task.h | 2 +- kernel/exit.c | 2 +- kernel/time/posix-timers.c | 19 ++++++++++++++----- 4 files changed, 17 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 0989fb8472a1..778123259e42 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1301,7 +1301,7 @@ int begin_new_exec(struct linux_binprm * bprm) bprm->mm = NULL; #ifdef CONFIG_POSIX_TIMERS - exit_itimers(me->signal); + exit_itimers(me); flush_itimer_signals(); #endif diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 505aaf9fe477..81cab4b01edc 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -85,7 +85,7 @@ static inline void exit_thread(struct task_struct *tsk) extern __noreturn void do_group_exit(int); extern void exit_files(struct task_struct *); -extern void exit_itimers(struct signal_struct *); +extern void exit_itimers(struct task_struct *); extern pid_t kernel_clone(struct kernel_clone_args *kargs); struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node); diff --git a/kernel/exit.c b/kernel/exit.c index f072959fcab7..64c938ce36fe 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -766,7 +766,7 @@ void __noreturn do_exit(long code) #ifdef CONFIG_POSIX_TIMERS hrtimer_cancel(&tsk->signal->real_timer); - exit_itimers(tsk->signal); + exit_itimers(tsk); #endif if (tsk->mm) setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 1cd10b102c51..5dead89308b7 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -1051,15 +1051,24 @@ retry_delete: } /* - * This is called by do_exit or de_thread, only when there are no more - * references to the shared signal_struct. + * This is called by do_exit or de_thread, only when nobody else can + * modify the signal->posix_timers list. Yet we need sighand->siglock + * to prevent the race with /proc/pid/timers. */ -void exit_itimers(struct signal_struct *sig) +void exit_itimers(struct task_struct *tsk) { + struct list_head timers; struct k_itimer *tmr; - while (!list_empty(&sig->posix_timers)) { - tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); + if (list_empty(&tsk->signal->posix_timers)) + return; + + spin_lock_irq(&tsk->sighand->siglock); + list_replace_init(&tsk->signal->posix_timers, &timers); + spin_unlock_irq(&tsk->sighand->siglock); + + while (!list_empty(&timers)) { + tmr = list_first_entry(&timers, struct k_itimer, list); itimer_delete(tmr); } } -- cgit From af16df54b89dee72df253abc5e7b5e8a6d16c11c Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Wed, 13 Jul 2022 15:21:11 +0800 Subject: ima: force signature verification when CONFIG_KEXEC_SIG is configured Currently, an unsigned kernel could be kexec'ed when IMA arch specific policy is configured unless lockdown is enabled. Enforce kernel signature verification check in the kexec_file_load syscall when IMA arch specific policy is configured. Fixes: 99d5cadfde2b ("kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG and KEXEC_SIG_FORCE") Reported-and-suggested-by: Mimi Zohar Signed-off-by: Coiby Xu Signed-off-by: Mimi Zohar --- include/linux/kexec.h | 6 ++++++ kernel/kexec_file.c | 11 ++++++++++- security/integrity/ima/ima_efi.c | 2 ++ 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index ce6536f1d269..475683cd67f1 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -452,6 +452,12 @@ static inline int kexec_crash_loaded(void) { return 0; } #define kexec_in_progress false #endif /* CONFIG_KEXEC_CORE */ +#ifdef CONFIG_KEXEC_SIG +void set_kexec_sig_enforced(void); +#else +static inline void set_kexec_sig_enforced(void) {} +#endif + #endif /* !defined(__ASSEBMLY__) */ #endif /* LINUX_KEXEC_H */ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 145321a5e798..f9261c07b048 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -29,6 +29,15 @@ #include #include "kexec_internal.h" +#ifdef CONFIG_KEXEC_SIG +static bool sig_enforce = IS_ENABLED(CONFIG_KEXEC_SIG_FORCE); + +void set_kexec_sig_enforced(void) +{ + sig_enforce = true; +} +#endif + static int kexec_calculate_store_digests(struct kimage *image); /* @@ -159,7 +168,7 @@ kimage_validate_signature(struct kimage *image) image->kernel_buf_len); if (ret) { - if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) { + if (sig_enforce) { pr_notice("Enforced kernel signature verification failed (%d).\n", ret); return ret; } diff --git a/security/integrity/ima/ima_efi.c b/security/integrity/ima/ima_efi.c index 71786d01946f..9db66fe310d4 100644 --- a/security/integrity/ima/ima_efi.c +++ b/security/integrity/ima/ima_efi.c @@ -67,6 +67,8 @@ const char * const *arch_get_ima_policy(void) if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot()) { if (IS_ENABLED(CONFIG_MODULE_SIG)) set_module_sig_enforced(); + if (IS_ENABLED(CONFIG_KEXEC_SIG)) + set_kexec_sig_enforced(); return sb_arch_rules; } return NULL; -- cgit From fac47b43c760ea90e64b895dba60df0327be7775 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 11 Jul 2022 12:11:21 +0800 Subject: netfs: do not unlock and put the folio twice check_write_begin() will unlock and put the folio when return non-zero. So we should avoid unlocking and putting it twice in netfs layer. Change the way ->check_write_begin() works in the following two ways: (1) Pass it a pointer to the folio pointer, allowing it to unlock and put the folio prior to doing the stuff it wants to do, provided it clears the folio pointer. (2) Change the return values such that 0 with folio pointer set means continue, 0 with folio pointer cleared means re-get and all error codes indicating an error (no special treatment for -EAGAIN). [ bagasdotme: use Sphinx code text syntax for *foliop pointer ] Cc: stable@vger.kernel.org Link: https://tracker.ceph.com/issues/56423 Link: https://lore.kernel.org/r/cf169f43-8ee7-8697-25da-0204d1b4343e@redhat.com Co-developed-by: David Howells Signed-off-by: Xiubo Li Signed-off-by: David Howells Signed-off-by: Bagas Sanjaya Signed-off-by: Ilya Dryomov --- Documentation/filesystems/netfs_library.rst | 8 +++++--- fs/afs/file.c | 2 +- fs/ceph/addr.c | 11 ++++++----- fs/netfs/buffered_read.c | 17 ++++++++++------- include/linux/netfs.h | 2 +- 5 files changed, 23 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/netfs_library.rst b/Documentation/filesystems/netfs_library.rst index 4d19b19bcc08..73a4176144b3 100644 --- a/Documentation/filesystems/netfs_library.rst +++ b/Documentation/filesystems/netfs_library.rst @@ -301,7 +301,7 @@ through which it can issue requests and negotiate:: void (*issue_read)(struct netfs_io_subrequest *subreq); bool (*is_still_valid)(struct netfs_io_request *rreq); int (*check_write_begin)(struct file *file, loff_t pos, unsigned len, - struct folio *folio, void **_fsdata); + struct folio **foliop, void **_fsdata); void (*done)(struct netfs_io_request *rreq); }; @@ -381,8 +381,10 @@ The operations are as follows: allocated/grabbed the folio to be modified to allow the filesystem to flush conflicting state before allowing it to be modified. - It should return 0 if everything is now fine, -EAGAIN if the folio should be - regrabbed and any other error code to abort the operation. + It may unlock and discard the folio it was given and set the caller's folio + pointer to NULL. It should return 0 if everything is now fine (``*foliop`` + left set) or the op should be retried (``*foliop`` cleared) and any other + error code to abort the operation. * ``done`` diff --git a/fs/afs/file.c b/fs/afs/file.c index 42118a4f3383..d1cfb235c4b9 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -375,7 +375,7 @@ static int afs_begin_cache_operation(struct netfs_io_request *rreq) } static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, - struct folio *folio, void **_fsdata) + struct folio **foliop, void **_fsdata) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6dee88815491..d6e5916138e4 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -63,7 +63,7 @@ (CONGESTION_ON_THRESH(congestion_kb) >> 2)) static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, - struct folio *folio, void **_fsdata); + struct folio **foliop, void **_fsdata); static inline struct ceph_snap_context *page_snap_context(struct page *page) { @@ -1288,18 +1288,19 @@ ceph_find_incompatible(struct page *page) } static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, - struct folio *folio, void **_fsdata) + struct folio **foliop, void **_fsdata) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc; - snapc = ceph_find_incompatible(folio_page(folio, 0)); + snapc = ceph_find_incompatible(folio_page(*foliop, 0)); if (snapc) { int r; - folio_unlock(folio); - folio_put(folio); + folio_unlock(*foliop); + folio_put(*foliop); + *foliop = NULL; if (IS_ERR(snapc)) return PTR_ERR(snapc); diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 42f892c5712e..0ce535852151 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -319,8 +319,9 @@ zero_out: * conflicting writes once the folio is grabbed and locked. It is passed a * pointer to the fsdata cookie that gets returned to the VM to be passed to * write_end. It is permitted to sleep. It should return 0 if the request - * should go ahead; unlock the folio and return -EAGAIN to cause the folio to - * be regot; or return an error. + * should go ahead or it may return an error. It may also unlock and put the + * folio, provided it sets ``*foliop`` to NULL, in which case a return of 0 + * will cause the folio to be re-got and the process to be retried. * * The calling netfs must initialise a netfs context contiguous to the vfs * inode before calling this. @@ -348,13 +349,13 @@ retry: if (ctx->ops->check_write_begin) { /* Allow the netfs (eg. ceph) to flush conflicts. */ - ret = ctx->ops->check_write_begin(file, pos, len, folio, _fsdata); + ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata); if (ret < 0) { trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin); - if (ret == -EAGAIN) - goto retry; goto error; } + if (!folio) + goto retry; } if (folio_test_uptodate(folio)) @@ -416,8 +417,10 @@ have_folio_no_wait: error_put: netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); error: - folio_unlock(folio); - folio_put(folio); + if (folio) { + folio_unlock(folio); + folio_put(folio); + } _leave(" = %d", ret); return ret; } diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 1773e5df8e65..1b18dfa52e48 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -214,7 +214,7 @@ struct netfs_request_ops { void (*issue_read)(struct netfs_io_subrequest *subreq); bool (*is_still_valid)(struct netfs_io_request *rreq); int (*check_write_begin)(struct file *file, loff_t pos, unsigned len, - struct folio *folio, void **_fsdata); + struct folio **foliop, void **_fsdata); void (*done)(struct netfs_io_request *rreq); }; -- cgit From 1b870fa5573e260bc74d19f381ab0dd971a8d8e7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 14 Jul 2022 07:27:31 -0400 Subject: kvm: stats: tell userspace which values are boolean Some of the statistics values exported by KVM are always only 0 or 1. It can be useful to export this fact to userspace so that it can track them specially (for example by polling the value every now and then to compute a % of time spent in a specific state). Therefore, add "boolean value" as a new "unit". While it is not exactly a unit, it walks and quacks like one. In particular, using the type would be wrong because boolean values could be instantaneous or peak values (e.g. "is the rmap allocated?") or even two-bucket histograms (e.g. "number of posted vs. non-posted interrupt injections"). Suggested-by: Amneesh Singh Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 6 ++++++ arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 11 ++++++++++- include/uapi/linux/kvm.h | 1 + 4 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 11e00a46c610..48bf6e49a7de 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5657,6 +5657,7 @@ by a string of size ``name_size``. #define KVM_STATS_UNIT_BYTES (0x1 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) + #define KVM_STATS_UNIT_BOOLEAN (0x4 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES #define KVM_STATS_BASE_SHIFT 8 @@ -5724,6 +5725,11 @@ Bits 4-7 of ``flags`` encode the unit: It indicates that the statistics data is used to measure time or latency. * ``KVM_STATS_UNIT_CYCLES`` It indicates that the statistics data is used to measure CPU clock cycles. + * ``KVM_STATS_UNIT_BOOLEAN`` + It indicates that the statistic will always be either 0 or 1. Boolean + statistics of "peak" type will never go back from 1 to 0. Boolean + statistics can be linear histograms (with two buckets) but not logarithmic + histograms. Bits 8-11 of ``flags``, together with ``exponent``, encode the scale of the unit: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 26d0cac32f73..0c3e85e8fce9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -298,7 +298,7 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, directed_yield_successful), STATS_DESC_COUNTER(VCPU, preemption_reported), STATS_DESC_COUNTER(VCPU, preemption_other), - STATS_DESC_ICOUNTER(VCPU, guest_mode) + STATS_DESC_IBOOLEAN(VCPU, guest_mode) }; const struct kvm_stats_header kvm_vcpu_stats_header = { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 83cf7fd842e0..90a45ef7203b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1822,6 +1822,15 @@ struct _kvm_stats_desc { STATS_DESC_PEAK(SCOPE, name, KVM_STATS_UNIT_NONE, \ KVM_STATS_BASE_POW10, 0) +/* Instantaneous boolean value, read only */ +#define STATS_DESC_IBOOLEAN(SCOPE, name) \ + STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BOOLEAN, \ + KVM_STATS_BASE_POW10, 0) +/* Peak (sticky) boolean value, read/write */ +#define STATS_DESC_PBOOLEAN(SCOPE, name) \ + STATS_DESC_PEAK(SCOPE, name, KVM_STATS_UNIT_BOOLEAN, \ + KVM_STATS_BASE_POW10, 0) + /* Cumulative time in nanosecond */ #define STATS_DESC_TIME_NSEC(SCOPE, name) \ STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ @@ -1853,7 +1862,7 @@ struct _kvm_stats_desc { HALT_POLL_HIST_COUNT), \ STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_wait_hist, \ HALT_POLL_HIST_COUNT), \ - STATS_DESC_ICOUNTER(VCPU_GENERIC, blocking) + STATS_DESC_IBOOLEAN(VCPU_GENERIC, blocking) extern struct dentry *kvm_debugfs_dir; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 5088bd9f1922..811897dadcae 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -2083,6 +2083,7 @@ struct kvm_stats_header { #define KVM_STATS_UNIT_BYTES (0x1 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) +#define KVM_STATS_UNIT_BOOLEAN (0x4 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES #define KVM_STATS_BASE_SHIFT 8 -- cgit