From 6549c46af8551b346bcc0b9043f93848319acd5c Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Sun, 27 Jun 2021 16:04:18 +0800 Subject: regulator: rt5033: Fix n_voltages settings for BUCK and LDO For linear regulators, the n_voltages should be (max - min) / step + 1. Buck voltage from 1v to 3V, per step 100mV, and vout mask is 0x1f. If value is from 20 to 31, the voltage will all be fixed to 3V. And LDO also, just vout range is different from 1.2v to 3v, step is the same. If value is from 18 to 31, the voltage will also be fixed to 3v. Signed-off-by: Axel Lin Reviewed-by: ChiYuan Huang Link: https://lore.kernel.org/r/20210627080418.1718127-1-axel.lin@ingics.com Signed-off-by: Mark Brown --- include/linux/mfd/rt5033-private.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/rt5033-private.h b/include/linux/mfd/rt5033-private.h index 2d1895c3efbf..40a0c2dfb80f 100644 --- a/include/linux/mfd/rt5033-private.h +++ b/include/linux/mfd/rt5033-private.h @@ -200,13 +200,13 @@ enum rt5033_reg { #define RT5033_REGULATOR_BUCK_VOLTAGE_MIN 1000000U #define RT5033_REGULATOR_BUCK_VOLTAGE_MAX 3000000U #define RT5033_REGULATOR_BUCK_VOLTAGE_STEP 100000U -#define RT5033_REGULATOR_BUCK_VOLTAGE_STEP_NUM 32 +#define RT5033_REGULATOR_BUCK_VOLTAGE_STEP_NUM 21 /* RT5033 regulator LDO output voltage uV */ #define RT5033_REGULATOR_LDO_VOLTAGE_MIN 1200000U #define RT5033_REGULATOR_LDO_VOLTAGE_MAX 3000000U #define RT5033_REGULATOR_LDO_VOLTAGE_STEP 100000U -#define RT5033_REGULATOR_LDO_VOLTAGE_STEP_NUM 32 +#define RT5033_REGULATOR_LDO_VOLTAGE_STEP_NUM 19 /* RT5033 regulator SAFE LDO output voltage uV */ #define RT5033_REGULATOR_SAFE_LDO_VOLTAGE 4900000U -- cgit From 5d43f951b1ac797450bb4d230fdc960b739bea04 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Wed, 30 Jun 2021 16:11:52 +0800 Subject: ptp: add ptp virtual clock driver framework This patch is to add ptp virtual clock driver framework utilizing timecounter/cyclecounter. The patch just exports two essential APIs for PTP driver. - ptp_vclock_register() - ptp_vclock_unregister() Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- drivers/ptp/Makefile | 2 +- drivers/ptp/ptp_private.h | 15 ++++ drivers/ptp/ptp_vclock.c | 150 +++++++++++++++++++++++++++++++++++++++ include/linux/ptp_clock_kernel.h | 4 +- 4 files changed, 169 insertions(+), 2 deletions(-) create mode 100644 drivers/ptp/ptp_vclock.c (limited to 'include/linux') diff --git a/drivers/ptp/Makefile b/drivers/ptp/Makefile index 8673d1743faa..28a6fe342d3e 100644 --- a/drivers/ptp/Makefile +++ b/drivers/ptp/Makefile @@ -3,7 +3,7 @@ # Makefile for PTP 1588 clock support. # -ptp-y := ptp_clock.o ptp_chardev.o ptp_sysfs.o +ptp-y := ptp_clock.o ptp_chardev.o ptp_sysfs.o ptp_vclock.o ptp_kvm-$(CONFIG_X86) := ptp_kvm_x86.o ptp_kvm_common.o ptp_kvm-$(CONFIG_HAVE_ARM_SMCCC) := ptp_kvm_arm.o ptp_kvm_common.o obj-$(CONFIG_PTP_1588_CLOCK) += ptp.o diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h index 6b97155148f1..853b79b6b30e 100644 --- a/drivers/ptp/ptp_private.h +++ b/drivers/ptp/ptp_private.h @@ -48,6 +48,19 @@ struct ptp_clock { struct kthread_delayed_work aux_work; }; +#define info_to_vclock(d) container_of((d), struct ptp_vclock, info) +#define cc_to_vclock(d) container_of((d), struct ptp_vclock, cc) +#define dw_to_vclock(d) container_of((d), struct ptp_vclock, refresh_work) + +struct ptp_vclock { + struct ptp_clock *pclock; + struct ptp_clock_info info; + struct ptp_clock *clock; + struct cyclecounter cc; + struct timecounter tc; + spinlock_t lock; /* protects tc/cc */ +}; + /* * The function queue_cnt() is safe for readers to call without * holding q->lock. Readers use this function to verify that the queue @@ -89,4 +102,6 @@ extern const struct attribute_group *ptp_groups[]; int ptp_populate_pin_groups(struct ptp_clock *ptp); void ptp_cleanup_pin_groups(struct ptp_clock *ptp); +struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock); +void ptp_vclock_unregister(struct ptp_vclock *vclock); #endif diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c new file mode 100644 index 000000000000..fc9205cc504d --- /dev/null +++ b/drivers/ptp/ptp_vclock.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * PTP virtual clock driver + * + * Copyright 2021 NXP + */ +#include +#include "ptp_private.h" + +#define PTP_VCLOCK_CC_SHIFT 31 +#define PTP_VCLOCK_CC_MULT (1 << PTP_VCLOCK_CC_SHIFT) +#define PTP_VCLOCK_FADJ_SHIFT 9 +#define PTP_VCLOCK_FADJ_DENOMINATOR 15625ULL +#define PTP_VCLOCK_REFRESH_INTERVAL (HZ * 2) + +static int ptp_vclock_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) +{ + struct ptp_vclock *vclock = info_to_vclock(ptp); + unsigned long flags; + s64 adj; + + adj = (s64)scaled_ppm << PTP_VCLOCK_FADJ_SHIFT; + adj = div_s64(adj, PTP_VCLOCK_FADJ_DENOMINATOR); + + spin_lock_irqsave(&vclock->lock, flags); + timecounter_read(&vclock->tc); + vclock->cc.mult = PTP_VCLOCK_CC_MULT + adj; + spin_unlock_irqrestore(&vclock->lock, flags); + + return 0; +} + +static int ptp_vclock_adjtime(struct ptp_clock_info *ptp, s64 delta) +{ + struct ptp_vclock *vclock = info_to_vclock(ptp); + unsigned long flags; + + spin_lock_irqsave(&vclock->lock, flags); + timecounter_adjtime(&vclock->tc, delta); + spin_unlock_irqrestore(&vclock->lock, flags); + + return 0; +} + +static int ptp_vclock_gettime(struct ptp_clock_info *ptp, + struct timespec64 *ts) +{ + struct ptp_vclock *vclock = info_to_vclock(ptp); + unsigned long flags; + u64 ns; + + spin_lock_irqsave(&vclock->lock, flags); + ns = timecounter_read(&vclock->tc); + spin_unlock_irqrestore(&vclock->lock, flags); + *ts = ns_to_timespec64(ns); + + return 0; +} + +static int ptp_vclock_settime(struct ptp_clock_info *ptp, + const struct timespec64 *ts) +{ + struct ptp_vclock *vclock = info_to_vclock(ptp); + u64 ns = timespec64_to_ns(ts); + unsigned long flags; + + spin_lock_irqsave(&vclock->lock, flags); + timecounter_init(&vclock->tc, &vclock->cc, ns); + spin_unlock_irqrestore(&vclock->lock, flags); + + return 0; +} + +static long ptp_vclock_refresh(struct ptp_clock_info *ptp) +{ + struct ptp_vclock *vclock = info_to_vclock(ptp); + struct timespec64 ts; + + ptp_vclock_gettime(&vclock->info, &ts); + + return PTP_VCLOCK_REFRESH_INTERVAL; +} + +static const struct ptp_clock_info ptp_vclock_info = { + .owner = THIS_MODULE, + .name = "ptp virtual clock", + /* The maximum ppb value that long scaled_ppm can support */ + .max_adj = 32767999, + .adjfine = ptp_vclock_adjfine, + .adjtime = ptp_vclock_adjtime, + .gettime64 = ptp_vclock_gettime, + .settime64 = ptp_vclock_settime, + .do_aux_work = ptp_vclock_refresh, +}; + +static u64 ptp_vclock_read(const struct cyclecounter *cc) +{ + struct ptp_vclock *vclock = cc_to_vclock(cc); + struct ptp_clock *ptp = vclock->pclock; + struct timespec64 ts = {}; + + if (ptp->info->gettimex64) + ptp->info->gettimex64(ptp->info, &ts, NULL); + else + ptp->info->gettime64(ptp->info, &ts); + + return timespec64_to_ns(&ts); +} + +static const struct cyclecounter ptp_vclock_cc = { + .read = ptp_vclock_read, + .mask = CYCLECOUNTER_MASK(32), + .mult = PTP_VCLOCK_CC_MULT, + .shift = PTP_VCLOCK_CC_SHIFT, +}; + +struct ptp_vclock *ptp_vclock_register(struct ptp_clock *pclock) +{ + struct ptp_vclock *vclock; + + vclock = kzalloc(sizeof(*vclock), GFP_KERNEL); + if (!vclock) + return NULL; + + vclock->pclock = pclock; + vclock->info = ptp_vclock_info; + vclock->cc = ptp_vclock_cc; + + snprintf(vclock->info.name, PTP_CLOCK_NAME_LEN, "ptp%d_virt", + pclock->index); + + spin_lock_init(&vclock->lock); + + vclock->clock = ptp_clock_register(&vclock->info, &pclock->dev); + if (IS_ERR_OR_NULL(vclock->clock)) { + kfree(vclock); + return NULL; + } + + timecounter_init(&vclock->tc, &vclock->cc, 0); + ptp_schedule_worker(vclock->clock, PTP_VCLOCK_REFRESH_INTERVAL); + + return vclock; +} + +void ptp_vclock_unregister(struct ptp_vclock *vclock) +{ + ptp_clock_unregister(vclock->clock); + kfree(vclock); +} diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index aba237c0b3a2..b6fb771ee524 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -11,7 +11,9 @@ #include #include #include +#include +#define PTP_CLOCK_NAME_LEN 32 /** * struct ptp_clock_request - request PTP clock event * @@ -134,7 +136,7 @@ struct ptp_system_timestamp { struct ptp_clock_info { struct module *owner; - char name[16]; + char name[PTP_CLOCK_NAME_LEN]; s32 max_adj; int n_alarm; int n_ext_ts; -- cgit From acb288e8047b7569fbc9af6fa6e9405315345103 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Wed, 30 Jun 2021 16:11:55 +0800 Subject: ptp: add kernel API ptp_get_vclocks_index() Add kernel API ptp_get_vclocks_index() to get all ptp vclocks index on pclock. This is preparation for supporting ptp vclocks info query through ethtool. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- drivers/ptp/ptp_clock.c | 3 ++- drivers/ptp/ptp_private.h | 2 ++ drivers/ptp/ptp_vclock.c | 35 +++++++++++++++++++++++++++++++++++ include/linux/ptp_clock_kernel.h | 14 ++++++++++++++ 4 files changed, 53 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index 9205a9362a9d..f012fa581cf4 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -24,10 +24,11 @@ #define PTP_PPS_EVENT PPS_CAPTUREASSERT #define PTP_PPS_MODE (PTP_PPS_DEFAULTS | PPS_CANWAIT | PPS_TSFMT_TSPEC) +struct class *ptp_class; + /* private globals */ static dev_t ptp_devt; -static struct class *ptp_class; static DEFINE_IDA(ptp_clocks_map); diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h index f75fadd9b244..dba6be477067 100644 --- a/drivers/ptp/ptp_private.h +++ b/drivers/ptp/ptp_private.h @@ -96,6 +96,8 @@ static inline bool ptp_vclock_in_use(struct ptp_clock *ptp) return in_use; } +extern struct class *ptp_class; + /* * see ptp_chardev.c */ diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c index fc9205cc504d..cefab29a0592 100644 --- a/drivers/ptp/ptp_vclock.c +++ b/drivers/ptp/ptp_vclock.c @@ -148,3 +148,38 @@ void ptp_vclock_unregister(struct ptp_vclock *vclock) ptp_clock_unregister(vclock->clock); kfree(vclock); } + +int ptp_get_vclocks_index(int pclock_index, int **vclock_index) +{ + char name[PTP_CLOCK_NAME_LEN] = ""; + struct ptp_clock *ptp; + struct device *dev; + int num = 0; + + if (pclock_index < 0) + return num; + + snprintf(name, PTP_CLOCK_NAME_LEN, "ptp%d", pclock_index); + dev = class_find_device_by_name(ptp_class, name); + if (!dev) + return num; + + ptp = dev_get_drvdata(dev); + + if (mutex_lock_interruptible(&ptp->n_vclocks_mux)) { + put_device(dev); + return num; + } + + *vclock_index = kzalloc(sizeof(int) * ptp->n_vclocks, GFP_KERNEL); + if (!(*vclock_index)) + goto out; + + memcpy(*vclock_index, ptp->vclock_index, sizeof(int) * ptp->n_vclocks); + num = ptp->n_vclocks; +out: + mutex_unlock(&ptp->n_vclocks_mux); + put_device(dev); + return num; +} +EXPORT_SYMBOL(ptp_get_vclocks_index); diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index b6fb771ee524..300a984fec87 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -306,6 +306,18 @@ int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay); */ void ptp_cancel_worker_sync(struct ptp_clock *ptp); +/** + * ptp_get_vclocks_index() - get all vclocks index on pclock, and + * caller is responsible to free memory + * of vclock_index + * + * @pclock_index: phc index of ptp pclock. + * @vclock_index: pointer to pointer of vclock index. + * + * return number of vclocks. + */ +int ptp_get_vclocks_index(int pclock_index, int **vclock_index); + #else static inline struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, struct device *parent) @@ -325,6 +337,8 @@ static inline int ptp_schedule_worker(struct ptp_clock *ptp, { return -EOPNOTSUPP; } static inline void ptp_cancel_worker_sync(struct ptp_clock *ptp) { } +static inline int ptp_get_vclocks_index(int pclock_index, int **vclock_index) +{ return 0; } #endif -- cgit From c156174a67070042d51d2c866146d3c934d5468c Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Wed, 30 Jun 2021 16:11:56 +0800 Subject: ethtool: add a new command for getting PHC virtual clocks Add an interface for getting PHC (PTP Hardware Clock) virtual clocks, which are based on PHC physical clock providing hardware timestamp to network packets. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 22 +++++++ include/linux/ethtool.h | 10 +++ include/uapi/linux/ethtool_netlink.h | 15 +++++ net/ethtool/Makefile | 2 +- net/ethtool/common.c | 13 ++++ net/ethtool/netlink.c | 10 +++ net/ethtool/netlink.h | 2 + net/ethtool/phc_vclocks.c | 94 ++++++++++++++++++++++++++++ 8 files changed, 167 insertions(+), 1 deletion(-) create mode 100644 net/ethtool/phc_vclocks.c (limited to 'include/linux') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 6ea91e41593f..c86628e6a235 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -212,6 +212,7 @@ Userspace to kernel: ``ETHTOOL_MSG_FEC_SET`` set FEC settings ``ETHTOOL_MSG_MODULE_EEPROM_GET`` read SFP module EEPROM ``ETHTOOL_MSG_STATS_GET`` get standard statistics + ``ETHTOOL_MSG_PHC_VCLOCKS_GET`` get PHC virtual clocks info ===================================== ================================ Kernel to userspace: @@ -250,6 +251,7 @@ Kernel to userspace: ``ETHTOOL_MSG_FEC_NTF`` FEC settings ``ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY`` read SFP module EEPROM ``ETHTOOL_MSG_STATS_GET_REPLY`` standard statistics + ``ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY`` PHC virtual clocks info ======================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -1477,6 +1479,25 @@ Low and high bounds are inclusive, for example: etherStatsPkts512to1023Octets 512 1023 ============================= ==== ==== +PHC_VCLOCKS_GET +=============== + +Query device PHC virtual clocks information. + +Request contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_PHC_VCLOCKS_HEADER`` nested request header + ==================================== ====== ========================== + +Kernel response contents: + + ==================================== ====== ========================== + ``ETHTOOL_A_PHC_VCLOCKS_HEADER`` nested reply header + ``ETHTOOL_A_PHC_VCLOCKS_NUM`` u32 PHC virtual clocks number + ``ETHTOOL_A_PHC_VCLOCKS_INDEX`` s32 PHC index array + ==================================== ====== ========================== + Request translation =================== @@ -1575,4 +1596,5 @@ are netlink only. n/a ``ETHTOOL_MSG_CABLE_TEST_ACT`` n/a ``ETHTOOL_MSG_CABLE_TEST_TDR_ACT`` n/a ``ETHTOOL_MSG_TUNNEL_INFO_GET`` + n/a ``ETHTOOL_MSG_PHC_VCLOCKS_GET`` =================================== ===================================== diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 29dbb603bc91..232daaec56e4 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -757,6 +757,16 @@ void ethtool_params_from_link_mode(struct ethtool_link_ksettings *link_ksettings, enum ethtool_link_mode_bit_indices link_mode); +/** + * ethtool_get_phc_vclocks - Derive phc vclocks information, and caller + * is responsible to free memory of vclock_index + * @dev: pointer to net_device structure + * @vclock_index: pointer to pointer of vclock index + * + * Return number of phc vclocks + */ +int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index); + /** * ethtool_sprintf - Write formatted string to ethtool string data * @data: Pointer to start of string to update diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index c7135c9c37a5..b3b93710eff7 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -46,6 +46,7 @@ enum { ETHTOOL_MSG_FEC_SET, ETHTOOL_MSG_MODULE_EEPROM_GET, ETHTOOL_MSG_STATS_GET, + ETHTOOL_MSG_PHC_VCLOCKS_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -88,6 +89,7 @@ enum { ETHTOOL_MSG_FEC_NTF, ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY, ETHTOOL_MSG_STATS_GET_REPLY, + ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -440,6 +442,19 @@ enum { ETHTOOL_A_TSINFO_MAX = (__ETHTOOL_A_TSINFO_CNT - 1) }; +/* PHC VCLOCKS */ + +enum { + ETHTOOL_A_PHC_VCLOCKS_UNSPEC, + ETHTOOL_A_PHC_VCLOCKS_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_PHC_VCLOCKS_NUM, /* u32 */ + ETHTOOL_A_PHC_VCLOCKS_INDEX, /* array, s32 */ + + /* add new constants above here */ + __ETHTOOL_A_PHC_VCLOCKS_CNT, + ETHTOOL_A_PHC_VCLOCKS_MAX = (__ETHTOOL_A_PHC_VCLOCKS_CNT - 1) +}; + /* CABLE TEST */ enum { diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 723c9a8a8cdf..0a19470efbfb 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ - tunnels.o fec.o eeprom.o stats.o + tunnels.o fec.o eeprom.o stats.o phc_vclocks.o diff --git a/net/ethtool/common.c b/net/ethtool/common.c index f9dcbad84788..798231b07676 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "common.h" @@ -554,6 +555,18 @@ int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info) return 0; } +int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index) +{ + struct ethtool_ts_info info = { }; + int num = 0; + + if (!__ethtool_get_ts_info(dev, &info)) + num = ptp_get_vclocks_index(info.phc_index, vclock_index); + + return num; +} +EXPORT_SYMBOL(ethtool_get_phc_vclocks); + const struct ethtool_phy_ops *ethtool_phy_ops; void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops) diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index a7346346114f..73e0f5b626bf 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -248,6 +248,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_TSINFO_GET] = ðnl_tsinfo_request_ops, [ETHTOOL_MSG_MODULE_EEPROM_GET] = ðnl_module_eeprom_request_ops, [ETHTOOL_MSG_STATS_GET] = ðnl_stats_request_ops, + [ETHTOOL_MSG_PHC_VCLOCKS_GET] = ðnl_phc_vclocks_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -958,6 +959,15 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_stats_get_policy, .maxattr = ARRAY_SIZE(ethnl_stats_get_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_phc_vclocks_get_policy, + .maxattr = ARRAY_SIZE(ethnl_phc_vclocks_get_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 3e25a47fd482..3fc395c86702 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -347,6 +347,7 @@ extern const struct ethnl_request_ops ethnl_tsinfo_request_ops; extern const struct ethnl_request_ops ethnl_fec_request_ops; extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops; extern const struct ethnl_request_ops ethnl_stats_request_ops; +extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -382,6 +383,7 @@ extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1]; extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1]; extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS + 1]; extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1]; +extern const struct nla_policy ethnl_phc_vclocks_get_policy[ETHTOOL_A_PHC_VCLOCKS_HEADER + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); diff --git a/net/ethtool/phc_vclocks.c b/net/ethtool/phc_vclocks.c new file mode 100644 index 000000000000..637b2f5297d5 --- /dev/null +++ b/net/ethtool/phc_vclocks.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2021 NXP + */ +#include "netlink.h" +#include "common.h" + +struct phc_vclocks_req_info { + struct ethnl_req_info base; +}; + +struct phc_vclocks_reply_data { + struct ethnl_reply_data base; + int num; + int *index; +}; + +#define PHC_VCLOCKS_REPDATA(__reply_base) \ + container_of(__reply_base, struct phc_vclocks_reply_data, base) + +const struct nla_policy ethnl_phc_vclocks_get_policy[] = { + [ETHTOOL_A_PHC_VCLOCKS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), +}; + +static int phc_vclocks_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + data->num = ethtool_get_phc_vclocks(dev, &data->index); + ethnl_ops_complete(dev); + + return ret; +} + +static int phc_vclocks_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct phc_vclocks_reply_data *data = + PHC_VCLOCKS_REPDATA(reply_base); + int len = 0; + + if (data->num > 0) { + len += nla_total_size(sizeof(u32)); + len += nla_total_size(sizeof(s32) * data->num); + } + + return len; +} + +static int phc_vclocks_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct phc_vclocks_reply_data *data = + PHC_VCLOCKS_REPDATA(reply_base); + + if (data->num <= 0) + return 0; + + if (nla_put_u32(skb, ETHTOOL_A_PHC_VCLOCKS_NUM, data->num) || + nla_put(skb, ETHTOOL_A_PHC_VCLOCKS_INDEX, + sizeof(s32) * data->num, data->index)) + return -EMSGSIZE; + + return 0; +} + +static void phc_vclocks_cleanup_data(struct ethnl_reply_data *reply_base) +{ + const struct phc_vclocks_reply_data *data = + PHC_VCLOCKS_REPDATA(reply_base); + + kfree(data->index); +} + +const struct ethnl_request_ops ethnl_phc_vclocks_request_ops = { + .request_cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET, + .reply_cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY, + .hdr_attr = ETHTOOL_A_PHC_VCLOCKS_HEADER, + .req_info_size = sizeof(struct phc_vclocks_req_info), + .reply_data_size = sizeof(struct phc_vclocks_reply_data), + + .prepare_data = phc_vclocks_prepare_data, + .reply_size = phc_vclocks_reply_size, + .fill_reply = phc_vclocks_fill_reply, + .cleanup_data = phc_vclocks_cleanup_data, +}; -- cgit From 895487a3a10fb3a177e20dcde875515d46ccd4df Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Wed, 30 Jun 2021 16:11:57 +0800 Subject: ptp: add kernel API ptp_convert_timestamp() Add kernel API ptp_convert_timestamp() to convert raw hardware timestamp to a specified ptp vclock time. Signed-off-by: Yangbo Lu Signed-off-by: David S. Miller --- drivers/ptp/ptp_vclock.c | 34 ++++++++++++++++++++++++++++++++++ include/linux/ptp_clock_kernel.h | 13 +++++++++++++ 2 files changed, 47 insertions(+) (limited to 'include/linux') diff --git a/drivers/ptp/ptp_vclock.c b/drivers/ptp/ptp_vclock.c index cefab29a0592..e0f87c57749a 100644 --- a/drivers/ptp/ptp_vclock.c +++ b/drivers/ptp/ptp_vclock.c @@ -183,3 +183,37 @@ out: return num; } EXPORT_SYMBOL(ptp_get_vclocks_index); + +void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps, + int vclock_index) +{ + char name[PTP_CLOCK_NAME_LEN] = ""; + struct ptp_vclock *vclock; + struct ptp_clock *ptp; + unsigned long flags; + struct device *dev; + u64 ns; + + snprintf(name, PTP_CLOCK_NAME_LEN, "ptp%d", vclock_index); + dev = class_find_device_by_name(ptp_class, name); + if (!dev) + return; + + ptp = dev_get_drvdata(dev); + if (!ptp->is_virtual_clock) { + put_device(dev); + return; + } + + vclock = info_to_vclock(ptp->info); + + ns = ktime_to_ns(hwtstamps->hwtstamp); + + spin_lock_irqsave(&vclock->lock, flags); + ns = timecounter_cyc2time(&vclock->tc, ns); + spin_unlock_irqrestore(&vclock->lock, flags); + + put_device(dev); + hwtstamps->hwtstamp = ns_to_ktime(ns); +} +EXPORT_SYMBOL(ptp_convert_timestamp); diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 300a984fec87..71fac9237725 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -12,6 +12,7 @@ #include #include #include +#include #define PTP_CLOCK_NAME_LEN 32 /** @@ -318,6 +319,15 @@ void ptp_cancel_worker_sync(struct ptp_clock *ptp); */ int ptp_get_vclocks_index(int pclock_index, int **vclock_index); +/** + * ptp_convert_timestamp() - convert timestamp to a ptp vclock time + * + * @hwtstamps: skb_shared_hwtstamps structure pointer + * @vclock_index: phc index of ptp vclock. + */ +void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps, + int vclock_index); + #else static inline struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, struct device *parent) @@ -339,6 +349,9 @@ static inline void ptp_cancel_worker_sync(struct ptp_clock *ptp) { } static inline int ptp_get_vclocks_index(int pclock_index, int **vclock_index) { return 0; } +static inline void ptp_convert_timestamp(struct skb_shared_hwtstamps *hwtstamps, + int vclock_index) +{ } #endif -- cgit From b2aae654a4794ef898ad33a179f341eb610f6b85 Mon Sep 17 00:00:00 2001 From: Xiaoliang Yang Date: Mon, 5 Jul 2021 18:26:54 +0800 Subject: net: stmmac: add mutex lock to protect est parameters Add a mutex lock to protect est structure parameters so that the EST parameters can be updated by other threads. Signed-off-by: Xiaoliang Yang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c | 12 +++++++++++- include/linux/stmmac.h | 1 + 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c index 596626c71189..2e3cdf540168 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c @@ -796,14 +796,18 @@ static int tc_setup_taprio(struct stmmac_priv *priv, GFP_KERNEL); if (!plat->est) return -ENOMEM; + + mutex_init(&priv->plat->est->lock); } else { memset(plat->est, 0, sizeof(*plat->est)); } size = qopt->num_entries; + mutex_lock(&priv->plat->est->lock); priv->plat->est->gcl_size = size; priv->plat->est->enable = qopt->enable; + mutex_unlock(&priv->plat->est->lock); for (i = 0; i < size; i++) { s64 delta_ns = qopt->entries[i].interval; @@ -834,6 +838,7 @@ static int tc_setup_taprio(struct stmmac_priv *priv, priv->plat->est->gcl[i] = delta_ns | (gates << wid); } + mutex_lock(&priv->plat->est->lock); /* Adjust for real system time */ priv->ptp_clock_ops.gettime64(&priv->ptp_clock_ops, ¤t_time); current_time_ns = timespec64_to_ktime(current_time); @@ -847,8 +852,10 @@ static int tc_setup_taprio(struct stmmac_priv *priv, priv->plat->est->ctr[0] = do_div(ctr, NSEC_PER_SEC); priv->plat->est->ctr[1] = (u32)ctr; - if (fpe && !priv->dma_cap.fpesel) + if (fpe && !priv->dma_cap.fpesel) { + mutex_unlock(&priv->plat->est->lock); return -EOPNOTSUPP; + } /* Actual FPE register configuration will be done after FPE handshake * is success. @@ -857,6 +864,7 @@ static int tc_setup_taprio(struct stmmac_priv *priv, ret = stmmac_est_configure(priv, priv->ioaddr, priv->plat->est, priv->plat->clk_ptp_rate); + mutex_unlock(&priv->plat->est->lock); if (ret) { netdev_err(priv->dev, "failed to configure EST\n"); goto disable; @@ -872,9 +880,11 @@ static int tc_setup_taprio(struct stmmac_priv *priv, return 0; disable: + mutex_lock(&priv->plat->est->lock); priv->plat->est->enable = false; stmmac_est_configure(priv, priv->ioaddr, priv->plat->est, priv->plat->clk_ptp_rate); + mutex_unlock(&priv->plat->est->lock); priv->plat->fpe_cfg->enable = false; stmmac_fpe_configure(priv, priv->ioaddr, diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index d5ae621d66ba..09157b8a5810 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -115,6 +115,7 @@ struct stmmac_axi { #define EST_GCL 1024 struct stmmac_est { + struct mutex lock; int enable; u32 btr_offset[2]; u32 btr[2]; -- cgit From e9e3720002f61cd637a49ecafae77cac230eefae Mon Sep 17 00:00:00 2001 From: Xiaoliang Yang Date: Mon, 5 Jul 2021 18:26:55 +0800 Subject: net: stmmac: ptp: update tas basetime after ptp adjust After adjusting the ptp time, the Qbv base time may be the past time of the new current time. dwmac5 hardware limited the base time cannot be set as past time. This patch add a btr_reserve to store the base time get from qopt, then calculate the base time and reset the Qbv configuration after ptp time adjust. Signed-off-by: Xiaoliang Yang Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c | 41 +++++++++++++++++++++++- drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c | 6 +++- include/linux/stmmac.h | 1 + 3 files changed, 46 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c index 4e86cdf2bc9f..580cc035536b 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c @@ -62,7 +62,8 @@ static int stmmac_adjust_time(struct ptp_clock_info *ptp, s64 delta) u32 sec, nsec; u32 quotient, reminder; int neg_adj = 0; - bool xmac; + bool xmac, est_rst = false; + int ret; xmac = priv->plat->has_gmac4 || priv->plat->has_xgmac; @@ -75,10 +76,48 @@ static int stmmac_adjust_time(struct ptp_clock_info *ptp, s64 delta) sec = quotient; nsec = reminder; + /* If EST is enabled, disabled it before adjust ptp time. */ + if (priv->plat->est && priv->plat->est->enable) { + est_rst = true; + mutex_lock(&priv->plat->est->lock); + priv->plat->est->enable = false; + stmmac_est_configure(priv, priv->ioaddr, priv->plat->est, + priv->plat->clk_ptp_rate); + mutex_unlock(&priv->plat->est->lock); + } + spin_lock_irqsave(&priv->ptp_lock, flags); stmmac_adjust_systime(priv, priv->ptpaddr, sec, nsec, neg_adj, xmac); spin_unlock_irqrestore(&priv->ptp_lock, flags); + /* Caculate new basetime and re-configured EST after PTP time adjust. */ + if (est_rst) { + struct timespec64 current_time, time; + ktime_t current_time_ns, basetime; + u64 cycle_time; + + mutex_lock(&priv->plat->est->lock); + priv->ptp_clock_ops.gettime64(&priv->ptp_clock_ops, ¤t_time); + current_time_ns = timespec64_to_ktime(current_time); + time.tv_nsec = priv->plat->est->btr_reserve[0]; + time.tv_sec = priv->plat->est->btr_reserve[1]; + basetime = timespec64_to_ktime(time); + cycle_time = priv->plat->est->ctr[1] * NSEC_PER_SEC + + priv->plat->est->ctr[0]; + time = stmmac_calc_tas_basetime(basetime, + current_time_ns, + cycle_time); + + priv->plat->est->btr[0] = (u32)time.tv_nsec; + priv->plat->est->btr[1] = (u32)time.tv_sec; + priv->plat->est->enable = true; + ret = stmmac_est_configure(priv, priv->ioaddr, priv->plat->est, + priv->plat->clk_ptp_rate); + mutex_unlock(&priv->plat->est->lock); + if (ret) + netdev_err(priv->dev, "failed to configure EST\n"); + } + return 0; } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c index 2e3cdf540168..4f3b6437b114 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c @@ -739,7 +739,7 @@ static int tc_setup_taprio(struct stmmac_priv *priv, { u32 size, wid = priv->dma_cap.estwid, dep = priv->dma_cap.estdep; struct plat_stmmacenet_data *plat = priv->plat; - struct timespec64 time, current_time; + struct timespec64 time, current_time, qopt_time; ktime_t current_time_ns; bool fpe = false; int i, ret = 0; @@ -848,6 +848,10 @@ static int tc_setup_taprio(struct stmmac_priv *priv, priv->plat->est->btr[0] = (u32)time.tv_nsec; priv->plat->est->btr[1] = (u32)time.tv_sec; + qopt_time = ktime_to_timespec64(qopt->base_time); + priv->plat->est->btr_reserve[0] = (u32)qopt_time.tv_nsec; + priv->plat->est->btr_reserve[1] = (u32)qopt_time.tv_sec; + ctr = qopt->cycle_time; priv->plat->est->ctr[0] = do_div(ctr, NSEC_PER_SEC); priv->plat->est->ctr[1] = (u32)ctr; diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 09157b8a5810..a6f03b36fc4f 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -117,6 +117,7 @@ struct stmmac_axi { struct stmmac_est { struct mutex lock; int enable; + u32 btr_reserve[2]; u32 btr_offset[2]; u32 btr[2]; u32 ctr[2]; -- cgit From f263a81451c12da5a342d90572e317e611846f2c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 7 Jul 2021 15:38:47 -0700 Subject: bpf: Track subprog poke descriptors correctly and fix use-after-free Subprograms are calling map_poke_track(), but on program release there is no hook to call map_poke_untrack(). However, on program release, the aux memory (and poke descriptor table) is freed even though we still have a reference to it in the element list of the map aux data. When we run map_poke_run(), we then end up accessing free'd memory, triggering KASAN in prog_array_map_poke_run(): [...] [ 402.824689] BUG: KASAN: use-after-free in prog_array_map_poke_run+0xc2/0x34e [ 402.824698] Read of size 4 at addr ffff8881905a7940 by task hubble-fgs/4337 [ 402.824705] CPU: 1 PID: 4337 Comm: hubble-fgs Tainted: G I 5.12.0+ #399 [ 402.824715] Call Trace: [ 402.824719] dump_stack+0x93/0xc2 [ 402.824727] print_address_description.constprop.0+0x1a/0x140 [ 402.824736] ? prog_array_map_poke_run+0xc2/0x34e [ 402.824740] ? prog_array_map_poke_run+0xc2/0x34e [ 402.824744] kasan_report.cold+0x7c/0xd8 [ 402.824752] ? prog_array_map_poke_run+0xc2/0x34e [ 402.824757] prog_array_map_poke_run+0xc2/0x34e [ 402.824765] bpf_fd_array_map_update_elem+0x124/0x1a0 [...] The elements concerned are walked as follows: for (i = 0; i < elem->aux->size_poke_tab; i++) { poke = &elem->aux->poke_tab[i]; [...] The access to size_poke_tab is a 4 byte read, verified by checking offsets in the KASAN dump: [ 402.825004] The buggy address belongs to the object at ffff8881905a7800 which belongs to the cache kmalloc-1k of size 1024 [ 402.825008] The buggy address is located 320 bytes inside of 1024-byte region [ffff8881905a7800, ffff8881905a7c00) The pahole output of bpf_prog_aux: struct bpf_prog_aux { [...] /* --- cacheline 5 boundary (320 bytes) --- */ u32 size_poke_tab; /* 320 4 */ [...] In general, subprograms do not necessarily manage their own data structures. For example, BTF func_info and linfo are just pointers to the main program structure. This allows reference counting and cleanup to be done on the latter which simplifies their management a bit. The aux->poke_tab struct, however, did not follow this logic. The initial proposed fix for this use-after-free bug further embedded poke data tracking into the subprogram with proper reference counting. However, Daniel and Alexei questioned why we were treating these objects special; I agree, its unnecessary. The fix here removes the per subprogram poke table allocation and map tracking and instead simply points the aux->poke_tab pointer at the main programs poke table. This way, map tracking is simplified to the main program and we do not need to manage them per subprogram. This also means, bpf_prog_free_deferred(), which unwinds the program reference counting and kfrees objects, needs to ensure that we don't try to double free the poke_tab when free'ing the subprog structures. This is easily solved by NULL'ing the poke_tab pointer. The second detail is to ensure that per subprogram JIT logic only does fixups on poke_tab[] entries it owns. To do this, we add a pointer in the poke structure to point at the subprogram value so JITs can easily check while walking the poke_tab structure if the current entry belongs to the current program. The aux pointer is stable and therefore suitable for such comparison. On the jit_subprogs() error path, we omit cleaning up the poke->aux field because these are only ever referenced from the JIT side, but on error we will never make it to the JIT, so its fine to leave them dangling. Removing these pointers would complicate the error path for no reason. However, we do need to untrack all poke descriptors from the main program as otherwise they could race with the freeing of JIT memory from the subprograms. Lastly, a748c6975dea3 ("bpf: propagate poke descriptors to subprograms") had an off-by-one on the subprogram instruction index range check as it was testing 'insn_idx >= subprog_start && insn_idx <= subprog_end'. However, subprog_end is the next subprogram's start instruction. Fixes: a748c6975dea3 ("bpf: propagate poke descriptors to subprograms") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Co-developed-by: Daniel Borkmann Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210707223848.14580-2-john.fastabend@gmail.com --- arch/x86/net/bpf_jit_comp.c | 3 +++ include/linux/bpf.h | 1 + kernel/bpf/core.c | 8 +++++- kernel/bpf/verifier.c | 60 ++++++++++++++++----------------------------- 4 files changed, 32 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e835164189f1..4b951458c9fc 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -570,6 +570,9 @@ static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) for (i = 0; i < prog->aux->size_poke_tab; i++) { poke = &prog->aux->poke_tab[i]; + if (poke->aux && poke->aux != prog->aux) + continue; + WARN_ON_ONCE(READ_ONCE(poke->tailcall_target_stable)); if (poke->reason != BPF_POKE_REASON_TAIL_CALL) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f309fc1509f2..e8e2b0393ca9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -780,6 +780,7 @@ struct bpf_jit_poke_descriptor { void *tailcall_target; void *tailcall_bypass; void *bypass_addr; + void *aux; union { struct { struct bpf_map *map; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 034ad93a1ad7..9b1577498373 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2236,8 +2236,14 @@ static void bpf_prog_free_deferred(struct work_struct *work) #endif if (aux->dst_trampoline) bpf_trampoline_put(aux->dst_trampoline); - for (i = 0; i < aux->func_cnt; i++) + for (i = 0; i < aux->func_cnt; i++) { + /* We can just unlink the subprog poke descriptor table as + * it was originally linked to the main program and is also + * released along with it. + */ + aux->func[i]->aux->poke_tab = NULL; bpf_jit_free(aux->func[i]); + } if (aux->func_cnt) { kfree(aux->func); bpf_prog_unlock_free(aux->prog); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index be38bb930bf1..42a4063de7cd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -12121,33 +12121,19 @@ static int jit_subprogs(struct bpf_verifier_env *env) goto out_free; func[i]->is_func = 1; func[i]->aux->func_idx = i; - /* the btf and func_info will be freed only at prog->aux */ + /* Below members will be freed only at prog->aux */ func[i]->aux->btf = prog->aux->btf; func[i]->aux->func_info = prog->aux->func_info; + func[i]->aux->poke_tab = prog->aux->poke_tab; + func[i]->aux->size_poke_tab = prog->aux->size_poke_tab; for (j = 0; j < prog->aux->size_poke_tab; j++) { - u32 insn_idx = prog->aux->poke_tab[j].insn_idx; - int ret; + struct bpf_jit_poke_descriptor *poke; - if (!(insn_idx >= subprog_start && - insn_idx <= subprog_end)) - continue; - - ret = bpf_jit_add_poke_descriptor(func[i], - &prog->aux->poke_tab[j]); - if (ret < 0) { - verbose(env, "adding tail call poke descriptor failed\n"); - goto out_free; - } - - func[i]->insnsi[insn_idx - subprog_start].imm = ret + 1; - - map_ptr = func[i]->aux->poke_tab[ret].tail_call.map; - ret = map_ptr->ops->map_poke_track(map_ptr, func[i]->aux); - if (ret < 0) { - verbose(env, "tracking tail call prog failed\n"); - goto out_free; - } + poke = &prog->aux->poke_tab[j]; + if (poke->insn_idx < subprog_end && + poke->insn_idx >= subprog_start) + poke->aux = func[i]->aux; } /* Use bpf_prog_F_tag to indicate functions in stack traces. @@ -12178,18 +12164,6 @@ static int jit_subprogs(struct bpf_verifier_env *env) cond_resched(); } - /* Untrack main program's aux structs so that during map_poke_run() - * we will not stumble upon the unfilled poke descriptors; each - * of the main program's poke descs got distributed across subprogs - * and got tracked onto map, so we are sure that none of them will - * be missed after the operation below - */ - for (i = 0; i < prog->aux->size_poke_tab; i++) { - map_ptr = prog->aux->poke_tab[i].tail_call.map; - - map_ptr->ops->map_poke_untrack(map_ptr, prog->aux); - } - /* at this point all bpf functions were successfully JITed * now populate all bpf_calls with correct addresses and * run last pass of JIT @@ -12267,14 +12241,22 @@ static int jit_subprogs(struct bpf_verifier_env *env) bpf_prog_jit_attempt_done(prog); return 0; out_free: + /* We failed JIT'ing, so at this point we need to unregister poke + * descriptors from subprogs, so that kernel is not attempting to + * patch it anymore as we're freeing the subprog JIT memory. + */ + for (i = 0; i < prog->aux->size_poke_tab; i++) { + map_ptr = prog->aux->poke_tab[i].tail_call.map; + map_ptr->ops->map_poke_untrack(map_ptr, prog->aux); + } + /* At this point we're guaranteed that poke descriptors are not + * live anymore. We can just unlink its descriptor table as it's + * released with the main prog. + */ for (i = 0; i < env->subprog_cnt; i++) { if (!func[i]) continue; - - for (j = 0; j < func[i]->aux->size_poke_tab; j++) { - map_ptr = func[i]->aux->poke_tab[j].tail_call.map; - map_ptr->ops->map_poke_untrack(map_ptr, func[i]->aux); - } + func[i]->aux->poke_tab = NULL; bpf_jit_free(func[i]); } kfree(func); -- cgit From a5de4be0aaaa66a2fa98e8a33bdbed3bd0682804 Mon Sep 17 00:00:00 2001 From: Marek BehĂșn Date: Sun, 11 Jul 2021 18:38:15 +0200 Subject: net: phy: marvell10g: fix differentiation of 88X3310 from 88X3340 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It seems that we cannot differentiate 88X3310 from 88X3340 by simply looking at bit 3 of revision ID. This only works on revisions A0 and A1. On revision B0, this bit is always 1. Instead use the 3.d00d register for differentiation, since this register contains information about number of ports on the device. Fixes: 9885d016ffa9 ("net: phy: marvell10g: add separate structure for 88X3340") Signed-off-by: Marek BehĂșn Reported-by: Matteo Croce Tested-by: Matteo Croce Signed-off-by: David S. Miller --- drivers/net/phy/marvell10g.c | 40 +++++++++++++++++++++++++++++++++++----- include/linux/marvell_phy.h | 6 +----- 2 files changed, 36 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c index bbbc6ac8fa82..53a433442803 100644 --- a/drivers/net/phy/marvell10g.c +++ b/drivers/net/phy/marvell10g.c @@ -78,6 +78,11 @@ enum { /* Temperature read register (88E2110 only) */ MV_PCS_TEMP = 0x8042, + /* Number of ports on the device */ + MV_PCS_PORT_INFO = 0xd00d, + MV_PCS_PORT_INFO_NPORTS_MASK = 0x0380, + MV_PCS_PORT_INFO_NPORTS_SHIFT = 7, + /* These registers appear at 0x800X and 0xa00X - the 0xa00X control * registers appear to set themselves to the 0x800X when AN is * restarted, but status registers appear readable from either. @@ -966,6 +971,30 @@ static const struct mv3310_chip mv2111_type = { #endif }; +static int mv3310_get_number_of_ports(struct phy_device *phydev) +{ + int ret; + + ret = phy_read_mmd(phydev, MDIO_MMD_PCS, MV_PCS_PORT_INFO); + if (ret < 0) + return ret; + + ret &= MV_PCS_PORT_INFO_NPORTS_MASK; + ret >>= MV_PCS_PORT_INFO_NPORTS_SHIFT; + + return ret + 1; +} + +static int mv3310_match_phy_device(struct phy_device *phydev) +{ + return mv3310_get_number_of_ports(phydev) == 1; +} + +static int mv3340_match_phy_device(struct phy_device *phydev) +{ + return mv3310_get_number_of_ports(phydev) == 4; +} + static int mv211x_match_phy_device(struct phy_device *phydev, bool has_5g) { int val; @@ -994,7 +1023,8 @@ static int mv2111_match_phy_device(struct phy_device *phydev) static struct phy_driver mv3310_drivers[] = { { .phy_id = MARVELL_PHY_ID_88X3310, - .phy_id_mask = MARVELL_PHY_ID_88X33X0_MASK, + .phy_id_mask = MARVELL_PHY_ID_MASK, + .match_phy_device = mv3310_match_phy_device, .name = "mv88x3310", .driver_data = &mv3310_type, .get_features = mv3310_get_features, @@ -1011,8 +1041,9 @@ static struct phy_driver mv3310_drivers[] = { .set_loopback = genphy_c45_loopback, }, { - .phy_id = MARVELL_PHY_ID_88X3340, - .phy_id_mask = MARVELL_PHY_ID_88X33X0_MASK, + .phy_id = MARVELL_PHY_ID_88X3310, + .phy_id_mask = MARVELL_PHY_ID_MASK, + .match_phy_device = mv3340_match_phy_device, .name = "mv88x3340", .driver_data = &mv3340_type, .get_features = mv3310_get_features, @@ -1069,8 +1100,7 @@ static struct phy_driver mv3310_drivers[] = { module_phy_driver(mv3310_drivers); static struct mdio_device_id __maybe_unused mv3310_tbl[] = { - { MARVELL_PHY_ID_88X3310, MARVELL_PHY_ID_88X33X0_MASK }, - { MARVELL_PHY_ID_88X3340, MARVELL_PHY_ID_88X33X0_MASK }, + { MARVELL_PHY_ID_88X3310, MARVELL_PHY_ID_MASK }, { MARVELL_PHY_ID_88E2110, MARVELL_PHY_ID_MASK }, { }, }; diff --git a/include/linux/marvell_phy.h b/include/linux/marvell_phy.h index acee44b9db26..0f06c2287b52 100644 --- a/include/linux/marvell_phy.h +++ b/include/linux/marvell_phy.h @@ -22,14 +22,10 @@ #define MARVELL_PHY_ID_88E1545 0x01410ea0 #define MARVELL_PHY_ID_88E1548P 0x01410ec0 #define MARVELL_PHY_ID_88E3016 0x01410e60 +#define MARVELL_PHY_ID_88X3310 0x002b09a0 #define MARVELL_PHY_ID_88E2110 0x002b09b0 #define MARVELL_PHY_ID_88X2222 0x01410f10 -/* PHY IDs and mask for Alaska 10G PHYs */ -#define MARVELL_PHY_ID_88X33X0_MASK 0xfffffff8 -#define MARVELL_PHY_ID_88X3310 0x002b09a0 -#define MARVELL_PHY_ID_88X3340 0x002b09a8 - /* Marvel 88E1111 in Finisar SFP module with modified PHY ID */ #define MARVELL_PHY_ID_88E1111_FINISAR 0x01ff0cc0 -- cgit From 0238bcf80e972f2ce25d767e54f89a9e49773f6e Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Mon, 5 Jul 2021 22:42:47 +0300 Subject: ASoC: ti: davinci-mcasp: Add support for the OMAP4 version of McASP There is a single McASP on OMAP4 (and OMAP5) which is configured to only support DIT playback mode on a single serializer. Add 0x200 offset to DAT port address as the TRM suggests it. Signed-off-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20210705194249.2385-4-peter.ujfalusi@gmail.com Signed-off-by: Mark Brown --- include/linux/platform_data/davinci_asp.h | 1 + sound/soc/ti/Kconfig | 1 + sound/soc/ti/davinci-mcasp.c | 26 +++++++++++++++++++++++--- 3 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/davinci_asp.h b/include/linux/platform_data/davinci_asp.h index 5d1fb0d78a22..76b13ef67562 100644 --- a/include/linux/platform_data/davinci_asp.h +++ b/include/linux/platform_data/davinci_asp.h @@ -96,6 +96,7 @@ enum { MCASP_VERSION_2, /* DA8xx/OMAPL1x */ MCASP_VERSION_3, /* TI81xx/AM33xx */ MCASP_VERSION_4, /* DRA7xxx */ + MCASP_VERSION_OMAP, /* OMAP4/5 */ }; enum mcbsp_clk_input_pin { diff --git a/sound/soc/ti/Kconfig b/sound/soc/ti/Kconfig index 698d7bc84dcf..1d9fe3fca193 100644 --- a/sound/soc/ti/Kconfig +++ b/sound/soc/ti/Kconfig @@ -35,6 +35,7 @@ config SND_SOC_DAVINCI_MCASP various Texas Instruments SoCs like: - daVinci devices - Sitara line of SoCs (AM335x, AM438x, etc) + - OMAP4 - DRA7x devices - Keystone devices - K3 devices (am654, j721e) diff --git a/sound/soc/ti/davinci-mcasp.c b/sound/soc/ti/davinci-mcasp.c index 64ec6d485834..56a19eeec5c7 100644 --- a/sound/soc/ti/davinci-mcasp.c +++ b/sound/soc/ti/davinci-mcasp.c @@ -1794,6 +1794,12 @@ static struct davinci_mcasp_pdata dra7_mcasp_pdata = { .version = MCASP_VERSION_4, }; +static struct davinci_mcasp_pdata omap_mcasp_pdata = { + .tx_dma_offset = 0x200, + .rx_dma_offset = 0, + .version = MCASP_VERSION_OMAP, +}; + static const struct of_device_id mcasp_dt_ids[] = { { .compatible = "ti,dm646x-mcasp-audio", @@ -1811,6 +1817,10 @@ static const struct of_device_id mcasp_dt_ids[] = { .compatible = "ti,dra7-mcasp-audio", .data = &dra7_mcasp_pdata, }, + { + .compatible = "ti,omap4-mcasp-audio", + .data = &omap_mcasp_pdata, + }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, mcasp_dt_ids); @@ -2350,10 +2360,17 @@ static int davinci_mcasp_probe(struct platform_device *pdev) dma_data = &mcasp->dma_data[SNDRV_PCM_STREAM_PLAYBACK]; dma_data->filter_data = "tx"; - if (dat) + if (dat) { dma_data->addr = dat->start; - else + /* + * According to the TRM there should be 0x200 offset added to + * the DAT port address + */ + if (mcasp->version == MCASP_VERSION_OMAP) + dma_data->addr += davinci_mcasp_txdma_offset(mcasp->pdata); + } else { dma_data->addr = mem->start + davinci_mcasp_txdma_offset(mcasp->pdata); + } /* RX is not valid in DIT mode */ @@ -2418,7 +2435,10 @@ static int davinci_mcasp_probe(struct platform_device *pdev) ret = edma_pcm_platform_register(&pdev->dev); break; case PCM_SDMA: - ret = sdma_pcm_platform_register(&pdev->dev, "tx", "rx"); + if (mcasp->op_mode == DAVINCI_MCASP_IIS_MODE) + ret = sdma_pcm_platform_register(&pdev->dev, "tx", "rx"); + else + ret = sdma_pcm_platform_register(&pdev->dev, "tx", NULL); break; case PCM_UDMA: ret = udma_pcm_platform_register(&pdev->dev); -- cgit From 79789db03fdd77510cfb35cb4b3bd52b6c50c901 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 12 Jul 2021 16:32:07 +0100 Subject: mm: Make copy_huge_page() always available Rewrite copy_huge_page() and move it into mm/util.c so it's always available. Fixes an exposure of uninitialised memory on configurations with HUGETLB and UFFD enabled and MIGRATION disabled. Fixes: 8cc5fcbb5be8 ("mm, hugetlb: fix racy resv_huge_pages underflow on UFFDIO_COPY") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Kravetz Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 5 ----- include/linux/mm.h | 1 + mm/migrate.c | 48 ------------------------------------------------ mm/util.c | 10 ++++++++++ 4 files changed, 11 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 9b7b7cd3bae9..23dadf7aeba8 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -51,7 +51,6 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page); extern int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, int extra_count); -extern void copy_huge_page(struct page *dst, struct page *src); #else static inline void putback_movable_pages(struct list_head *l) {} @@ -77,10 +76,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, { return -ENOSYS; } - -static inline void copy_huge_page(struct page *dst, struct page *src) -{ -} #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_COMPACTION diff --git a/include/linux/mm.h b/include/linux/mm.h index 57453dba41b9..7ca22e6e694a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -906,6 +906,7 @@ void __put_page(struct page *page); void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); +void copy_huge_page(struct page *dst, struct page *src); /* * Compound pages have a destructor function. Provide a diff --git a/mm/migrate.c b/mm/migrate.c index 23cbd9de030b..34a9ad3e0a4f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -536,54 +536,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, return MIGRATEPAGE_SUCCESS; } -/* - * Gigantic pages are so large that we do not guarantee that page++ pointer - * arithmetic will work across the entire page. We need something more - * specialized. - */ -static void __copy_gigantic_page(struct page *dst, struct page *src, - int nr_pages) -{ - int i; - struct page *dst_base = dst; - struct page *src_base = src; - - for (i = 0; i < nr_pages; ) { - cond_resched(); - copy_highpage(dst, src); - - i++; - dst = mem_map_next(dst, dst_base, i); - src = mem_map_next(src, src_base, i); - } -} - -void copy_huge_page(struct page *dst, struct page *src) -{ - int i; - int nr_pages; - - if (PageHuge(src)) { - /* hugetlbfs page */ - struct hstate *h = page_hstate(src); - nr_pages = pages_per_huge_page(h); - - if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) { - __copy_gigantic_page(dst, src, nr_pages); - return; - } - } else { - /* thp page */ - BUG_ON(!PageTransHuge(src)); - nr_pages = thp_nr_pages(src); - } - - for (i = 0; i < nr_pages; i++) { - cond_resched(); - copy_highpage(dst + i, src + i); - } -} - /* * Copy the page to its new location */ diff --git a/mm/util.c b/mm/util.c index 99c6cc77de9e..9043d03750a7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -731,6 +731,16 @@ int __page_mapcount(struct page *page) } EXPORT_SYMBOL_GPL(__page_mapcount); +void copy_huge_page(struct page *dst, struct page *src) +{ + unsigned i, nr = compound_nr(src); + + for (i = 0; i < nr; i++) { + cond_resched(); + copy_highpage(nth_page(dst, i), nth_page(src, i)); + } +} + int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; int sysctl_overcommit_ratio __read_mostly = 50; unsigned long sysctl_overcommit_kbytes __read_mostly; -- cgit From 52f83955aaf91b22f46765b007b4404ce85b2133 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 12 Jul 2021 14:08:01 +0100 Subject: firmware: arm_scmi: Fix kernel doc warnings Kernel doc validation script is unhappy and complains with the below set of warnings. | Function parameter or member 'fast_switch_possible' not described in 'scmi_perf_proto_ops' | Function parameter or member 'power_scale_mw_get' not described in 'scmi_perf_proto_ops' | cannot understand function prototype: 'struct scmi_sensor_reading ' | cannot understand function prototype: 'struct scmi_range_attrs ' | cannot understand function prototype: 'struct scmi_sensor_axis_info ' | cannot understand function prototype: 'struct scmi_sensor_intervals_info ' Fix them adding appropriate documents or missing keywords. Link: https://lore.kernel.org/r/20210712130801.2436492-2-sudeep.holla@arm.com Reviewed-by: Cristian Marussi Signed-off-by: Sudeep Holla --- include/linux/scmi_protocol.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index 79d0a1237e6c..80e781c51ddc 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -101,6 +101,10 @@ struct scmi_clk_proto_ops { * to sustained performance level mapping * @est_power_get: gets the estimated power cost for a given performance domain * at a given frequency + * @fast_switch_possible: indicates if fast DVFS switching is possible or not + * for a given device + * @power_scale_mw_get: indicates if the power values provided are in milliWatts + * or in some other (abstract) scale */ struct scmi_perf_proto_ops { int (*limits_set)(const struct scmi_protocol_handle *ph, u32 domain, @@ -153,7 +157,7 @@ struct scmi_power_proto_ops { }; /** - * scmi_sensor_reading - represent a timestamped read + * struct scmi_sensor_reading - represent a timestamped read * * Used by @reading_get_timestamped method. * @@ -167,7 +171,7 @@ struct scmi_sensor_reading { }; /** - * scmi_range_attrs - specifies a sensor or axis values' range + * struct scmi_range_attrs - specifies a sensor or axis values' range * @min_range: The minimum value which can be represented by the sensor/axis. * @max_range: The maximum value which can be represented by the sensor/axis. */ @@ -177,7 +181,7 @@ struct scmi_range_attrs { }; /** - * scmi_sensor_axis_info - describes one sensor axes + * struct scmi_sensor_axis_info - describes one sensor axes * @id: The axes ID. * @type: Axes type. Chosen amongst one of @enum scmi_sensor_class. * @scale: Power-of-10 multiplier applied to the axis unit. @@ -205,8 +209,8 @@ struct scmi_sensor_axis_info { }; /** - * scmi_sensor_intervals_info - describes number and type of available update - * intervals + * struct scmi_sensor_intervals_info - describes number and type of available + * update intervals * @segmented: Flag for segmented intervals' representation. When True there * will be exactly 3 intervals in @desc, with each entry * representing a member of a segment in this order: -- cgit From 5ff6319d46cee22c9cd6f39a377e32c444f9a7b0 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 12 Jul 2021 11:27:48 +0100 Subject: firmware: arm_scpi: Fix kernel doc warnings Kernel doc validation script is unhappy and complains with the below set of warnings. | Function parameter or member 'device_domain_id' not described in 'scpi_ops' | Function parameter or member 'get_transition_latency' not described in 'scpi_ops' | Function parameter or member 'add_opps_to_device' not described in 'scpi_ops' | Function parameter or member 'sensor_get_capability' not described in 'scpi_ops' | Function parameter or member 'sensor_get_info' not described in 'scpi_ops' | Function parameter or member 'sensor_get_value' not described in 'scpi_ops' | Function parameter or member 'device_get_power_state' not described in 'scpi_ops' | Function parameter or member 'device_set_power_state' not described in 'scpi_ops' Fix them adding appropriate documents or missing keywords. Link: https://lore.kernel.org/r/20210712130801.2436492-1-sudeep.holla@arm.com Reviewed-by: Cristian Marussi Signed-off-by: Sudeep Holla --- include/linux/scpi_protocol.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/scpi_protocol.h b/include/linux/scpi_protocol.h index afbf8037d8db..d2176a56828a 100644 --- a/include/linux/scpi_protocol.h +++ b/include/linux/scpi_protocol.h @@ -51,6 +51,14 @@ struct scpi_sensor_info { * OPP is an index to the list return by @dvfs_get_info * @dvfs_get_info: returns the DVFS capabilities of the given power * domain. It includes the OPP list and the latency information + * @device_domain_id: gets the scpi domain id for a given device + * @get_transition_latency: gets the DVFS transition latency for a given device + * @add_opps_to_device: adds all the OPPs for a given device + * @sensor_get_capability: get the list of capabilities for the sensors + * @sensor_get_info: get the information of the specified sensor + * @sensor_get_value: gets the current value of the sensor + * @device_get_power_state: gets the power state of a power domain + * @device_set_power_state: sets the power state of a power domain */ struct scpi_ops { u32 (*get_version)(void); -- cgit From d1d488d813703618f0dd93f0e4c4a05928114aa8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 14 Jul 2021 15:47:50 +0200 Subject: fs: add vfs_parse_fs_param_source() helper Add a simple helper that filesystems can use in their parameter parser to parse the "source" parameter. A few places open-coded this function and that already caused a bug in the cgroup v1 parser that we fixed. Let's make it harder to get this wrong by introducing a helper which performs all necessary checks. Link: https://syzkaller.appspot.com/bug?id=6312526aba5beae046fdae8f00399f87aab48b12 Cc: Christoph Hellwig Cc: Alexander Viro Cc: Dmitry Vyukov Signed-off-by: Christian Brauner Signed-off-by: Linus Torvalds --- fs/fs_context.c | 54 ++++++++++++++++++++++++++++++---------------- include/linux/fs_context.h | 2 ++ kernel/cgroup/cgroup-v1.c | 14 +++++------- 3 files changed, 43 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/fs/fs_context.c b/fs/fs_context.c index 2834d1afa6e8..de1985eae535 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -79,6 +79,35 @@ static int vfs_parse_sb_flag(struct fs_context *fc, const char *key) return -ENOPARAM; } +/** + * vfs_parse_fs_param_source - Handle setting "source" via parameter + * @fc: The filesystem context to modify + * @param: The parameter + * + * This is a simple helper for filesystems to verify that the "source" they + * accept is sane. + * + * Returns 0 on success, -ENOPARAM if this is not "source" parameter, and + * -EINVAL otherwise. In the event of failure, supplementary error information + * is logged. + */ +int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param) +{ + if (strcmp(param->key, "source") != 0) + return -ENOPARAM; + + if (param->type != fs_value_is_string) + return invalf(fc, "Non-string source"); + + if (fc->source) + return invalf(fc, "Multiple sources"); + + fc->source = param->string; + param->string = NULL; + return 0; +} +EXPORT_SYMBOL(vfs_parse_fs_param_source); + /** * vfs_parse_fs_param - Add a single parameter to a superblock config * @fc: The filesystem context to modify @@ -122,15 +151,9 @@ int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param) /* If the filesystem doesn't take any arguments, give it the * default handling of source. */ - if (strcmp(param->key, "source") == 0) { - if (param->type != fs_value_is_string) - return invalf(fc, "VFS: Non-string source"); - if (fc->source) - return invalf(fc, "VFS: Multiple sources"); - fc->source = param->string; - param->string = NULL; - return 0; - } + ret = vfs_parse_fs_param_source(fc, param); + if (ret != -ENOPARAM) + return ret; return invalf(fc, "%s: Unknown parameter '%s'", fc->fs_type->name, param->key); @@ -504,16 +527,11 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) struct legacy_fs_context *ctx = fc->fs_private; unsigned int size = ctx->data_size; size_t len = 0; + int ret; - if (strcmp(param->key, "source") == 0) { - if (param->type != fs_value_is_string) - return invalf(fc, "VFS: Legacy: Non-string source"); - if (fc->source) - return invalf(fc, "VFS: Legacy: Multiple sources"); - fc->source = param->string; - param->string = NULL; - return 0; - } + ret = vfs_parse_fs_param_source(fc, param); + if (ret != -ENOPARAM) + return ret; if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS) return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options"); diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index 37e1e8f7f08d..e2bc16300c82 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -139,6 +139,8 @@ extern int vfs_parse_fs_string(struct fs_context *fc, const char *key, extern int generic_parse_monolithic(struct fs_context *fc, void *data); extern int vfs_get_tree(struct fs_context *fc); extern void put_fs_context(struct fs_context *fc); +extern int vfs_parse_fs_param_source(struct fs_context *fc, + struct fs_parameter *param); /* * sget() wrappers to be called from the ->get_tree() op. diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 527917c0b30b..8d6bf56ed77a 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -911,15 +911,11 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) opt = fs_parse(fc, cgroup1_fs_parameters, param, &result); if (opt == -ENOPARAM) { - if (strcmp(param->key, "source") == 0) { - if (param->type != fs_value_is_string) - return invalf(fc, "Non-string source"); - if (fc->source) - return invalf(fc, "Multiple sources not supported"); - fc->source = param->string; - param->string = NULL; - return 0; - } + int ret; + + ret = vfs_parse_fs_param_source(fc, param); + if (ret != -ENOPARAM) + return ret; for_each_subsys(ss, i) { if (strcmp(param->key, ss->legacy_name)) continue; -- cgit From 2db710cc846d3321a4dc0977fa13769bddba2351 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 14 Jul 2021 21:26:40 -0700 Subject: kasan: fix build by including kernel.h The header relies on _RET_IP_ being defined, and had been receiving that definition via inclusion of bug.h which includes kernel.h. However, since f39650de687e ("kernel.h: split out panic and oops helpers") that is no longer the case and get the following build error when building CONFIG_KASAN_HW_TAGS on arm64: In file included from arch/arm64/mm/kasan_init.c:10: include/linux/kasan.h: In function 'kasan_slab_free': include/linux/kasan.h:230:39: error: '_RET_IP_' undeclared (first use in this function) 230 | return __kasan_slab_free(s, object, _RET_IP_, init); Fix it by including kernel.h from kasan.h. Link: https://lkml.kernel.org/r/20210705072716.2125074-1-elver@google.com Fixes: f39650de687e ("kernel.h: split out panic and oops helpers") Signed-off-by: Marco Elver Reviewed-by: Andy Shevchenko Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Peter Collingbourne Cc: Catalin Marinas Cc: Vincenzo Frascino Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 5310e217bd74..dd874a1ee862 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -3,6 +3,7 @@ #define _LINUX_KASAN_H #include +#include #include #include -- cgit From ab7965de1725cd8514f0edbced5c2fb793846078 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 14 Jul 2021 21:26:55 -0700 Subject: mm: fix the try_to_unmap prototype for !CONFIG_MMU Adjust the nommu stub of try_to_unmap to match the changed protype for the full version. Turn it into an inline instead of a macro to generally improve the type checking. Link: https://lkml.kernel.org/r/20210705053944.885828-1-hch@lst.de Fixes: 1fb08ac63bee ("mm: rmap: make try_to_unmap() void function") Signed-off-by: Christoph Hellwig Reviewed-by: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 83fb86133fe1..c976cc6de257 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -291,7 +291,9 @@ static inline int page_referenced(struct page *page, int is_locked, return 0; } -#define try_to_unmap(page, refs) false +static inline void try_to_unmap(struct page *page, enum ttu_flags flags) +{ +} static inline int page_mkclean(struct page *page) { -- cgit From e48bf29cf9d6d60d810e2af71e54b71a324094e0 Mon Sep 17 00:00:00 2001 From: Ye Xiang Date: Sun, 13 Jun 2021 11:25:07 +0800 Subject: HID: intel-ish-hid: use async resume function ISH IPC driver uses asynchronous workqueue to do resume now, but there is a potential timing issue: when child devices resume before bus driver, it will cause child devices resume failed and cannot be recovered until reboot. The current implementation in this case do wait for IPC to resume but fail to accommodate for a case when there is no ISH reboot and soft resume is taking time. This issue is apparent on Tiger Lake platform with 5.11.13 kernel when doing suspend to idle then resume(s0ix) test. To resolve this issue, we change ISHTP HID client to use asynchronous resume callback too. In the asynchronous resume callback, it waits for the ISHTP resume done event, and then notify ISHTP HID client link ready. Signed-off-by: Ye Xiang Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- drivers/hid/intel-ish-hid/ishtp-hid-client.c | 15 +++++++++++++- drivers/hid/intel-ish-hid/ishtp-hid.h | 1 + drivers/hid/intel-ish-hid/ishtp/bus.c | 29 +++++++++++++++++++++------- include/linux/intel-ish-client-if.h | 2 ++ 4 files changed, 39 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/intel-ish-hid/ishtp-hid-client.c b/drivers/hid/intel-ish-hid/ishtp-hid-client.c index 6b1fa971b33e..91bf4d01e91a 100644 --- a/drivers/hid/intel-ish-hid/ishtp-hid-client.c +++ b/drivers/hid/intel-ish-hid/ishtp-hid-client.c @@ -784,6 +784,17 @@ static void hid_ishtp_cl_reset_handler(struct work_struct *work) } } +static void hid_ishtp_cl_resume_handler(struct work_struct *work) +{ + struct ishtp_cl_data *client_data = container_of(work, struct ishtp_cl_data, resume_work); + struct ishtp_cl *hid_ishtp_cl = client_data->hid_ishtp_cl; + + if (ishtp_wait_resume(ishtp_get_ishtp_device(hid_ishtp_cl))) { + client_data->suspended = false; + wake_up_interruptible(&client_data->ishtp_resume_wait); + } +} + ishtp_print_log ishtp_hid_print_trace; /** @@ -822,6 +833,8 @@ static int hid_ishtp_cl_probe(struct ishtp_cl_device *cl_device) init_waitqueue_head(&client_data->ishtp_resume_wait); INIT_WORK(&client_data->work, hid_ishtp_cl_reset_handler); + INIT_WORK(&client_data->resume_work, hid_ishtp_cl_resume_handler); + ishtp_hid_print_trace = ishtp_trace_callback(cl_device); @@ -921,7 +934,7 @@ static int hid_ishtp_cl_resume(struct device *device) hid_ishtp_trace(client_data, "%s hid_ishtp_cl %p\n", __func__, hid_ishtp_cl); - client_data->suspended = false; + schedule_work(&client_data->resume_work); return 0; } diff --git a/drivers/hid/intel-ish-hid/ishtp-hid.h b/drivers/hid/intel-ish-hid/ishtp-hid.h index f88443a7d935..6a5cc11aefd8 100644 --- a/drivers/hid/intel-ish-hid/ishtp-hid.h +++ b/drivers/hid/intel-ish-hid/ishtp-hid.h @@ -135,6 +135,7 @@ struct ishtp_cl_data { int multi_packet_cnt; struct work_struct work; + struct work_struct resume_work; struct ishtp_cl_device *cl_device; }; diff --git a/drivers/hid/intel-ish-hid/ishtp/bus.c b/drivers/hid/intel-ish-hid/ishtp/bus.c index f0802b047ed8..aa2c51624012 100644 --- a/drivers/hid/intel-ish-hid/ishtp/bus.c +++ b/drivers/hid/intel-ish-hid/ishtp/bus.c @@ -314,13 +314,6 @@ static int ishtp_cl_device_resume(struct device *dev) if (!device) return 0; - /* - * When ISH needs hard reset, it is done asynchrnously, hence bus - * resume will be called before full ISH resume - */ - if (device->ishtp_dev->resume_flag) - return 0; - driver = to_ishtp_cl_driver(dev->driver); if (driver && driver->driver.pm) { if (driver->driver.pm->resume) @@ -849,6 +842,28 @@ struct device *ishtp_device(struct ishtp_cl_device *device) } EXPORT_SYMBOL(ishtp_device); +/** + * ishtp_wait_resume() - Wait for IPC resume + * + * Wait for IPC resume + * + * Return: resume complete or not + */ +bool ishtp_wait_resume(struct ishtp_device *dev) +{ + /* 50ms to get resume response */ + #define WAIT_FOR_RESUME_ACK_MS 50 + + /* Waiting to get resume response */ + if (dev->resume_flag) + wait_event_interruptible_timeout(dev->resume_wait, + !dev->resume_flag, + msecs_to_jiffies(WAIT_FOR_RESUME_ACK_MS)); + + return (!dev->resume_flag); +} +EXPORT_SYMBOL_GPL(ishtp_wait_resume); + /** * ishtp_get_pci_device() - Return PCI device dev pointer * This interface is used to return PCI device pointer diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h index 25e2b4e80502..aee8ff4739b1 100644 --- a/include/linux/intel-ish-client-if.h +++ b/include/linux/intel-ish-client-if.h @@ -81,6 +81,8 @@ int ishtp_register_event_cb(struct ishtp_cl_device *device, /* Get the device * from ishtp device instance */ struct device *ishtp_device(struct ishtp_cl_device *cl_device); +/* wait for IPC resume */ +bool ishtp_wait_resume(struct ishtp_device *dev); /* Trace interface for clients */ ishtp_print_log ishtp_trace_callback(struct ishtp_cl_device *cl_device); /* Get device pointer of PCI device for DMA acces */ -- cgit From e042aa532c84d18ff13291d00620502ce7a38dda Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 16 Jul 2021 09:18:21 +0000 Subject: bpf: Fix pointer arithmetic mask tightening under state pruning In 7fedb63a8307 ("bpf: Tighten speculative pointer arithmetic mask") we narrowed the offset mask for unprivileged pointer arithmetic in order to mitigate a corner case where in the speculative domain it is possible to advance, for example, the map value pointer by up to value_size-1 out-of- bounds in order to leak kernel memory via side-channel to user space. The verifier's state pruning for scalars leaves one corner case open where in the first verification path R_x holds an unknown scalar with an aux->alu_limit of e.g. 7, and in a second verification path that same register R_x, here denoted as R_x', holds an unknown scalar which has tighter bounds and would thus satisfy range_within(R_x, R_x') as well as tnum_in(R_x, R_x') for state pruning, yielding an aux->alu_limit of 3: Given the second path fits the register constraints for pruning, the final generated mask from aux->alu_limit will remain at 7. While technically not wrong for the non-speculative domain, it would however be possible to craft similar cases where the mask would be too wide as in 7fedb63a8307. One way to fix it is to detect the presence of unknown scalar map pointer arithmetic and force a deeper search on unknown scalars to ensure that we do not run into a masking mismatch. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 27 +++++++++++++++++---------- 2 files changed, 18 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e774ecc1cd1f..7ba7e800d472 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -414,6 +414,7 @@ struct bpf_verifier_env { u32 used_map_cnt; /* number of used maps */ u32 used_btf_cnt; /* number of used BTF objects */ u32 id_gen; /* used to generate unique reg IDs */ + bool explore_alu_limits; bool allow_ptr_leaks; bool allow_uninit_stack; bool allow_ptr_to_map_access; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8a7a28b4cfb9..657062cb4d85 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6561,6 +6561,12 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0; alu_state |= ptr_is_dst_reg ? BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; + + /* Limit pruning on unknown scalars to enable deep search for + * potential masking differences from other program paths. + */ + if (!off_is_imm) + env->explore_alu_limits = true; } err = update_alu_sanitation_state(aux, alu_state, alu_limit); @@ -9936,8 +9942,8 @@ next: } /* Returns true if (rold safe implies rcur safe) */ -static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, - struct bpf_id_pair *idmap) +static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, + struct bpf_reg_state *rcur, struct bpf_id_pair *idmap) { bool equal; @@ -9963,6 +9969,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; switch (rold->type) { case SCALAR_VALUE: + if (env->explore_alu_limits) + return false; if (rcur->type == SCALAR_VALUE) { if (!rold->precise && !rcur->precise) return true; @@ -10053,9 +10061,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; } -static bool stacksafe(struct bpf_func_state *old, - struct bpf_func_state *cur, - struct bpf_id_pair *idmap) +static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old, + struct bpf_func_state *cur, struct bpf_id_pair *idmap) { int i, spi; @@ -10100,9 +10107,8 @@ static bool stacksafe(struct bpf_func_state *old, continue; if (old->stack[spi].slot_type[0] != STACK_SPILL) continue; - if (!regsafe(&old->stack[spi].spilled_ptr, - &cur->stack[spi].spilled_ptr, - idmap)) + if (!regsafe(env, &old->stack[spi].spilled_ptr, + &cur->stack[spi].spilled_ptr, idmap)) /* when explored and current stack slot are both storing * spilled registers, check that stored pointers types * are the same as well. @@ -10159,10 +10165,11 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); for (i = 0; i < MAX_BPF_REG; i++) - if (!regsafe(&old->regs[i], &cur->regs[i], env->idmap_scratch)) + if (!regsafe(env, &old->regs[i], &cur->regs[i], + env->idmap_scratch)) return false; - if (!stacksafe(old, cur, env->idmap_scratch)) + if (!stacksafe(env, old, cur, env->idmap_scratch)) return false; if (!refsafe(old, cur)) -- cgit From ec645dc96699ea6c37b6de86c84d7288ea9a4ddf Mon Sep 17 00:00:00 2001 From: Oleksandr Natalenko Date: Sat, 17 Jul 2021 14:33:28 +0200 Subject: block: increase BLKCG_MAX_POLS After mq-deadline learned to deal with cgroups, the BLKCG_MAX_POLS value became too small for all the elevators to be registered properly. The following issue is seen: ``` calling bfq_init+0x0/0x8b @ 1 blkcg_policy_register: BLKCG_MAX_POLS too small initcall bfq_init+0x0/0x8b returned -28 after 507 usecs ``` which renders BFQ non-functional. Increase BLKCG_MAX_POLS to allow enough space for everyone. Fixes: 08a9ad8bf607 ("block/mq-deadline: Add cgroup support") Link: https://lore.kernel.org/lkml/8988303.mDXGIdCtx8@natalenko.name/ Signed-off-by: Oleksandr Natalenko Link: https://lore.kernel.org/r/20210717123328.945810-1-oleksandr@natalenko.name Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c454fb446fd0..2e12320cb121 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -57,7 +57,7 @@ struct blk_keyslot_manager; * Maximum number of blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. */ -#define BLKCG_MAX_POLS 5 +#define BLKCG_MAX_POLS 6 typedef void (rq_end_io_fn)(struct request *, blk_status_t); -- cgit From d6371c76e20d7d3f61b05fd67b596af4d14a8886 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Mon, 19 Jul 2021 09:51:34 +0100 Subject: bpf: Fix OOB read when printing XDP link fdinfo We got the following UBSAN report on one of our testing machines: ================================================================================ UBSAN: array-index-out-of-bounds in kernel/bpf/syscall.c:2389:24 index 6 is out of range for type 'char *[6]' CPU: 43 PID: 930921 Comm: systemd-coredum Tainted: G O 5.10.48-cloudflare-kasan-2021.7.0 #1 Hardware name: Call Trace: dump_stack+0x7d/0xa3 ubsan_epilogue+0x5/0x40 __ubsan_handle_out_of_bounds.cold+0x43/0x48 ? seq_printf+0x17d/0x250 bpf_link_show_fdinfo+0x329/0x380 ? bpf_map_value_size+0xe0/0xe0 ? put_files_struct+0x20/0x2d0 ? __kasan_kmalloc.constprop.0+0xc2/0xd0 seq_show+0x3f7/0x540 seq_read_iter+0x3f8/0x1040 seq_read+0x329/0x500 ? seq_read_iter+0x1040/0x1040 ? __fsnotify_parent+0x80/0x820 ? __fsnotify_update_child_dentry_flags+0x380/0x380 vfs_read+0x123/0x460 ksys_read+0xed/0x1c0 ? __x64_sys_pwrite64+0x1f0/0x1f0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 ================================================================================ ================================================================================ UBSAN: object-size-mismatch in kernel/bpf/syscall.c:2384:2 From the report, we can infer that some array access in bpf_link_show_fdinfo at index 6 is out of bounds. The obvious candidate is bpf_link_type_strs[BPF_LINK_TYPE_XDP] with BPF_LINK_TYPE_XDP == 6. It turns out that BPF_LINK_TYPE_XDP is missing from bpf_types.h and therefore doesn't have an entry in bpf_link_type_strs: pos: 0 flags: 02000000 mnt_id: 13 link_type: (null) link_id: 4 prog_tag: bcf7977d3b93787c prog_id: 4 ifindex: 1 Fixes: aa8d3a716b59 ("bpf, xdp: Add bpf_link-based XDP attachment API") Signed-off-by: Lorenz Bauer Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210719085134.43325-2-lmb@cloudflare.com --- include/linux/bpf_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index a9db1eae6796..ae3ac3a2018c 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -134,4 +134,5 @@ BPF_LINK_TYPE(BPF_LINK_TYPE_CGROUP, cgroup) BPF_LINK_TYPE(BPF_LINK_TYPE_ITER, iter) #ifdef CONFIG_NET BPF_LINK_TYPE(BPF_LINK_TYPE_NETNS, netns) +BPF_LINK_TYPE(BPF_LINK_TYPE_XDP, xdp) #endif -- cgit From dc7019b7d0e188d4093b34bd0747ed0d668c63bf Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Mon, 14 Jun 2021 17:33:14 -0500 Subject: tee: add tee_shm_alloc_kernel_buf() Adds a new function tee_shm_alloc_kernel_buf() to allocate shared memory from a kernel driver. This function can later be made more lightweight by unnecessary dma-buf export. Cc: stable@vger.kernel.org Reviewed-by: Tyler Hicks Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- drivers/tee/tee_shm.c | 18 ++++++++++++++++++ include/linux/tee_drv.h | 1 + 2 files changed, 19 insertions(+) (limited to 'include/linux') diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index 00472f5ce22e..c65e44707cd6 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -193,6 +193,24 @@ err_dev_put: } EXPORT_SYMBOL_GPL(tee_shm_alloc); +/** + * tee_shm_alloc_kernel_buf() - Allocate shared memory for kernel buffer + * @ctx: Context that allocates the shared memory + * @size: Requested size of shared memory + * + * The returned memory registered in secure world and is suitable to be + * passed as a memory buffer in parameter argument to + * tee_client_invoke_func(). The memory allocated is later freed with a + * call to tee_shm_free(). + * + * @returns a pointer to 'struct tee_shm' + */ +struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size) +{ + return tee_shm_alloc(ctx, size, TEE_SHM_MAPPED | TEE_SHM_DMA_BUF); +} +EXPORT_SYMBOL_GPL(tee_shm_alloc_kernel_buf); + struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr, size_t length, u32 flags) { diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 54269e47ac9a..8990f7628387 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -332,6 +332,7 @@ void *tee_get_drvdata(struct tee_device *teedev); * @returns a pointer to 'struct tee_shm' */ struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags); +struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size); /** * tee_shm_register() - Register shared memory buffer -- cgit From 376e4199e327a5cf29b8ec8fb0f64f3d8b429819 Mon Sep 17 00:00:00 2001 From: Sumit Garg Date: Mon, 14 Jun 2021 17:33:15 -0500 Subject: tee: Correct inappropriate usage of TEE_SHM_DMA_BUF flag Currently TEE_SHM_DMA_BUF flag has been inappropriately used to not register shared memory allocated for private usage by underlying TEE driver: OP-TEE in this case. So rather add a new flag as TEE_SHM_PRIV that can be utilized by underlying TEE drivers for private allocation and usage of shared memory. With this corrected, allow tee_shm_alloc_kernel_buf() to allocate a shared memory region without the backing of dma-buf. Cc: stable@vger.kernel.org Signed-off-by: Sumit Garg Co-developed-by: Tyler Hicks Signed-off-by: Tyler Hicks Reviewed-by: Jens Wiklander Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- drivers/tee/optee/call.c | 2 +- drivers/tee/optee/core.c | 3 ++- drivers/tee/optee/rpc.c | 5 +++-- drivers/tee/optee/shm_pool.c | 8 ++++++-- drivers/tee/tee_shm.c | 4 ++-- include/linux/tee_drv.h | 1 + 6 files changed, 15 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/tee/optee/call.c b/drivers/tee/optee/call.c index 387e94768182..945f03da0223 100644 --- a/drivers/tee/optee/call.c +++ b/drivers/tee/optee/call.c @@ -184,7 +184,7 @@ static struct tee_shm *get_msg_arg(struct tee_context *ctx, size_t num_params, struct optee_msg_arg *ma; shm = tee_shm_alloc(ctx, OPTEE_MSG_GET_ARG_SIZE(num_params), - TEE_SHM_MAPPED); + TEE_SHM_MAPPED | TEE_SHM_PRIV); if (IS_ERR(shm)) return shm; diff --git a/drivers/tee/optee/core.c b/drivers/tee/optee/core.c index 651d49b53d3b..5ce13b099d7d 100644 --- a/drivers/tee/optee/core.c +++ b/drivers/tee/optee/core.c @@ -278,7 +278,8 @@ static void optee_release(struct tee_context *ctx) if (!ctxdata) return; - shm = tee_shm_alloc(ctx, sizeof(struct optee_msg_arg), TEE_SHM_MAPPED); + shm = tee_shm_alloc(ctx, sizeof(struct optee_msg_arg), + TEE_SHM_MAPPED | TEE_SHM_PRIV); if (!IS_ERR(shm)) { arg = tee_shm_get_va(shm, 0); /* diff --git a/drivers/tee/optee/rpc.c b/drivers/tee/optee/rpc.c index 1849180b0278..efbaff7ad7e5 100644 --- a/drivers/tee/optee/rpc.c +++ b/drivers/tee/optee/rpc.c @@ -314,7 +314,7 @@ static void handle_rpc_func_cmd_shm_alloc(struct tee_context *ctx, shm = cmd_alloc_suppl(ctx, sz); break; case OPTEE_RPC_SHM_TYPE_KERNEL: - shm = tee_shm_alloc(ctx, sz, TEE_SHM_MAPPED); + shm = tee_shm_alloc(ctx, sz, TEE_SHM_MAPPED | TEE_SHM_PRIV); break; default: arg->ret = TEEC_ERROR_BAD_PARAMETERS; @@ -502,7 +502,8 @@ void optee_handle_rpc(struct tee_context *ctx, struct optee_rpc_param *param, switch (OPTEE_SMC_RETURN_GET_RPC_FUNC(param->a0)) { case OPTEE_SMC_RPC_FUNC_ALLOC: - shm = tee_shm_alloc(ctx, param->a1, TEE_SHM_MAPPED); + shm = tee_shm_alloc(ctx, param->a1, + TEE_SHM_MAPPED | TEE_SHM_PRIV); if (!IS_ERR(shm) && !tee_shm_get_pa(shm, 0, &pa)) { reg_pair_from_64(¶m->a1, ¶m->a2, pa); reg_pair_from_64(¶m->a4, ¶m->a5, diff --git a/drivers/tee/optee/shm_pool.c b/drivers/tee/optee/shm_pool.c index da06ce9b9313..c41a9a501a6e 100644 --- a/drivers/tee/optee/shm_pool.c +++ b/drivers/tee/optee/shm_pool.c @@ -27,7 +27,11 @@ static int pool_op_alloc(struct tee_shm_pool_mgr *poolm, shm->paddr = page_to_phys(page); shm->size = PAGE_SIZE << order; - if (shm->flags & TEE_SHM_DMA_BUF) { + /* + * Shared memory private to the OP-TEE driver doesn't need + * to be registered with OP-TEE. + */ + if (!(shm->flags & TEE_SHM_PRIV)) { unsigned int nr_pages = 1 << order, i; struct page **pages; @@ -60,7 +64,7 @@ err: static void pool_op_free(struct tee_shm_pool_mgr *poolm, struct tee_shm *shm) { - if (shm->flags & TEE_SHM_DMA_BUF) + if (!(shm->flags & TEE_SHM_PRIV)) optee_shm_unregister(shm->ctx, shm); free_pages((unsigned long)shm->kaddr, get_order(shm->size)); diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index c65e44707cd6..8a9384a64f3e 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -117,7 +117,7 @@ struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags) return ERR_PTR(-EINVAL); } - if ((flags & ~(TEE_SHM_MAPPED | TEE_SHM_DMA_BUF))) { + if ((flags & ~(TEE_SHM_MAPPED | TEE_SHM_DMA_BUF | TEE_SHM_PRIV))) { dev_err(teedev->dev.parent, "invalid shm flags 0x%x", flags); return ERR_PTR(-EINVAL); } @@ -207,7 +207,7 @@ EXPORT_SYMBOL_GPL(tee_shm_alloc); */ struct tee_shm *tee_shm_alloc_kernel_buf(struct tee_context *ctx, size_t size) { - return tee_shm_alloc(ctx, size, TEE_SHM_MAPPED | TEE_SHM_DMA_BUF); + return tee_shm_alloc(ctx, size, TEE_SHM_MAPPED); } EXPORT_SYMBOL_GPL(tee_shm_alloc_kernel_buf); diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h index 8990f7628387..3ebfea0781f1 100644 --- a/include/linux/tee_drv.h +++ b/include/linux/tee_drv.h @@ -27,6 +27,7 @@ #define TEE_SHM_USER_MAPPED BIT(4) /* Memory mapped in user space */ #define TEE_SHM_POOL BIT(5) /* Memory allocated from pool */ #define TEE_SHM_KERNEL_MAPPED BIT(6) /* Memory mapped in kernel space */ +#define TEE_SHM_PRIV BIT(7) /* Memory private to TEE driver */ struct device; struct tee_device; -- cgit From d8a719059b9dc963aa190598778ac804ff3e6a87 Mon Sep 17 00:00:00 2001 From: Jonathan Marek Date: Wed, 21 Jul 2021 17:02:13 +1000 Subject: Revert "mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge" This reverts commit c742199a014de23ee92055c2473d91fe5561ffdf. c742199a014d ("mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge") breaks arm64 in at least two ways for configurations where PUD or PMD folding occur: 1. We no longer install huge-vmap mappings and silently fall back to page-granular entries, despite being able to install block entries at what is effectively the PGD level. 2. If the linear map is backed with block mappings, these will now silently fail to be created in alloc_init_pud(), causing a panic early during boot. The pgtable selftests caught this, although a fix has not been forthcoming and Christophe is AWOL at the moment, so just revert the change for now to get a working -rc3 on which we can queue patches for 5.15. A simple revert breaks the build for 32-bit PowerPC 8xx machines, which rely on the default function definitions when the corresponding page-table levels are folded, since commit a6a8f7c4aa7e ("powerpc/8xx: add support for huge pages on VMAP and VMALLOC"), eg: powerpc64-linux-ld: mm/vmalloc.o: in function `vunmap_pud_range': linux/mm/vmalloc.c:362: undefined reference to `pud_clear_huge' To avoid that, add stubs for pud_clear_huge() and pmd_clear_huge() in arch/powerpc/mm/nohash/8xx.c as suggested by Christophe. Cc: Christophe Leroy Cc: Catalin Marinas Cc: Andrew Morton Cc: Nicholas Piggin Cc: Mike Rapoport Cc: Mark Rutland Cc: Geert Uytterhoeven Fixes: c742199a014d ("mm/pgtable: add stubs for {pmd/pub}_{set/clear}_huge") Signed-off-by: Jonathan Marek Reviewed-by: Ard Biesheuvel Acked-by: Marc Zyngier [mpe: Fold in 8xx.c changes from Christophe and mention in change log] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/linux-arm-kernel/CAMuHMdXShORDox-xxaeUfDW3wx2PeggFSqhVSHVZNKCGK-y_vQ@mail.gmail.com/ Link: https://lore.kernel.org/r/20210717160118.9855-1-jonathan@marek.ca Link: https://lore.kernel.org/r/87r1fs1762.fsf@mpe.ellerman.id.au Signed-off-by: Will Deacon --- arch/arm64/mm/mmu.c | 20 ++++++++------------ arch/powerpc/mm/nohash/8xx.c | 10 ++++++++++ arch/x86/mm/pgtable.c | 34 +++++++++++++++------------------- include/linux/pgtable.h | 26 +------------------------- 4 files changed, 34 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index d74586508448..9ff0de1b2b93 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1339,7 +1339,6 @@ void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot) return dt_virt; } -#if CONFIG_PGTABLE_LEVELS > 3 int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot) { pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot)); @@ -1354,16 +1353,6 @@ int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot) return 1; } -int pud_clear_huge(pud_t *pudp) -{ - if (!pud_sect(READ_ONCE(*pudp))) - return 0; - pud_clear(pudp); - return 1; -} -#endif - -#if CONFIG_PGTABLE_LEVELS > 2 int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot) { pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot)); @@ -1378,6 +1367,14 @@ int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot) return 1; } +int pud_clear_huge(pud_t *pudp) +{ + if (!pud_sect(READ_ONCE(*pudp))) + return 0; + pud_clear(pudp); + return 1; +} + int pmd_clear_huge(pmd_t *pmdp) { if (!pmd_sect(READ_ONCE(*pmdp))) @@ -1385,7 +1382,6 @@ int pmd_clear_huge(pmd_t *pmdp) pmd_clear(pmdp); return 1; } -#endif int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr) { diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c index 60780e089118..0df9fe29dd56 100644 --- a/arch/powerpc/mm/nohash/8xx.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -240,3 +240,13 @@ void __init setup_kuap(bool disabled) mtspr(SPRN_MD_AP, MD_APG_KUAP); } #endif + +int pud_clear_huge(pud_t *pud) +{ + return 0; +} + +int pmd_clear_huge(pmd_t *pmd) +{ + return 0; +} diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3364fe62b903..3481b35cb4ec 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -682,7 +682,6 @@ int p4d_clear_huge(p4d_t *p4d) } #endif -#if CONFIG_PGTABLE_LEVELS > 3 /** * pud_set_huge - setup kernel PUD mapping * @@ -721,23 +720,6 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) return 1; } -/** - * pud_clear_huge - clear kernel PUD mapping when it is set - * - * Returns 1 on success and 0 on failure (no PUD map is found). - */ -int pud_clear_huge(pud_t *pud) -{ - if (pud_large(*pud)) { - pud_clear(pud); - return 1; - } - - return 0; -} -#endif - -#if CONFIG_PGTABLE_LEVELS > 2 /** * pmd_set_huge - setup kernel PMD mapping * @@ -768,6 +750,21 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) return 1; } +/** + * pud_clear_huge - clear kernel PUD mapping when it is set + * + * Returns 1 on success and 0 on failure (no PUD map is found). + */ +int pud_clear_huge(pud_t *pud) +{ + if (pud_large(*pud)) { + pud_clear(pud); + return 1; + } + + return 0; +} + /** * pmd_clear_huge - clear kernel PMD mapping when it is set * @@ -782,7 +779,6 @@ int pmd_clear_huge(pmd_t *pmd) return 0; } -#endif #ifdef CONFIG_X86_64 /** diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index d147480cdefc..e24d2c992b11 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1397,34 +1397,10 @@ static inline int p4d_clear_huge(p4d_t *p4d) } #endif /* !__PAGETABLE_P4D_FOLDED */ -#ifndef __PAGETABLE_PUD_FOLDED int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); -int pud_clear_huge(pud_t *pud); -#else -static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) -{ - return 0; -} -static inline int pud_clear_huge(pud_t *pud) -{ - return 0; -} -#endif /* !__PAGETABLE_PUD_FOLDED */ - -#ifndef __PAGETABLE_PMD_FOLDED int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); +int pud_clear_huge(pud_t *pud); int pmd_clear_huge(pmd_t *pmd); -#else -static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) -{ - return 0; -} -static inline int pmd_clear_huge(pmd_t *pmd) -{ - return 0; -} -#endif /* !__PAGETABLE_PMD_FOLDED */ - int p4d_free_pud_page(p4d_t *p4d, unsigned long addr); int pud_free_pmd_page(pud_t *pud, unsigned long addr); int pmd_free_pte_page(pmd_t *pmd, unsigned long addr); -- cgit From 853a9ae29e978d37f5dfa72622a68c9ae3d7fa89 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 14 Jul 2021 10:04:27 +0200 Subject: serial: 8250: fix handle_irq locking The 8250 handle_irq callback is not just called from the interrupt handler but also from a timer callback when polling (e.g. for ports without an interrupt line). Consequently the callback must explicitly disable interrupts to avoid a potential deadlock with another interrupt in polled mode. Add back an irqrestore-version of the sysrq port-unlock helper and use it in the 8250 callbacks that need it. Fixes: 75f4e830fa9c ("serial: do not restore interrupt state in sysrq helper") Cc: stable@vger.kernel.org # 5.13 Cc: Joel Stanley Cc: Andrew Jeffery Reported-by: kernel test robot Signed-off-by: Johan Hovold Link: https://lore.kernel.org/r/20210714080427.28164-1-johan@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/8250/8250_aspeed_vuart.c | 5 +++-- drivers/tty/serial/8250/8250_fsl.c | 5 +++-- drivers/tty/serial/8250/8250_port.c | 5 +++-- include/linux/serial_core.h | 24 ++++++++++++++++++++++++ 4 files changed, 33 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/8250/8250_aspeed_vuart.c b/drivers/tty/serial/8250/8250_aspeed_vuart.c index 4caab8714e2c..2350fb3bb5e4 100644 --- a/drivers/tty/serial/8250/8250_aspeed_vuart.c +++ b/drivers/tty/serial/8250/8250_aspeed_vuart.c @@ -329,6 +329,7 @@ static int aspeed_vuart_handle_irq(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); unsigned int iir, lsr; + unsigned long flags; unsigned int space, count; iir = serial_port_in(port, UART_IIR); @@ -336,7 +337,7 @@ static int aspeed_vuart_handle_irq(struct uart_port *port) if (iir & UART_IIR_NO_INT) return 0; - spin_lock(&port->lock); + spin_lock_irqsave(&port->lock, flags); lsr = serial_port_in(port, UART_LSR); @@ -370,7 +371,7 @@ static int aspeed_vuart_handle_irq(struct uart_port *port) if (lsr & UART_LSR_THRE) serial8250_tx_chars(up); - uart_unlock_and_check_sysrq(port); + uart_unlock_and_check_sysrq_irqrestore(port, flags); return 1; } diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c index 4e75d2e4f87c..fc65a2293ce9 100644 --- a/drivers/tty/serial/8250/8250_fsl.c +++ b/drivers/tty/serial/8250/8250_fsl.c @@ -30,10 +30,11 @@ struct fsl8250_data { int fsl8250_handle_irq(struct uart_port *port) { unsigned char lsr, orig_lsr; + unsigned long flags; unsigned int iir; struct uart_8250_port *up = up_to_u8250p(port); - spin_lock(&up->port.lock); + spin_lock_irqsave(&up->port.lock, flags); iir = port->serial_in(port, UART_IIR); if (iir & UART_IIR_NO_INT) { @@ -82,7 +83,7 @@ int fsl8250_handle_irq(struct uart_port *port) up->lsr_saved_flags = orig_lsr; - uart_unlock_and_check_sysrq(&up->port); + uart_unlock_and_check_sysrq_irqrestore(&up->port, flags); return 1; } diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 2e7000f79b03..1da29a219842 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -1899,11 +1899,12 @@ int serial8250_handle_irq(struct uart_port *port, unsigned int iir) unsigned char status; struct uart_8250_port *up = up_to_u8250p(port); bool skip_rx = false; + unsigned long flags; if (iir & UART_IIR_NO_INT) return 0; - spin_lock(&port->lock); + spin_lock_irqsave(&port->lock, flags); status = serial_port_in(port, UART_LSR); @@ -1929,7 +1930,7 @@ int serial8250_handle_irq(struct uart_port *port, unsigned int iir) (up->ier & UART_IER_THRI)) serial8250_tx_chars(up); - uart_unlock_and_check_sysrq(port); + uart_unlock_and_check_sysrq_irqrestore(port, flags); return 1; } diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 52d7fb92a69d..c58cc142d23f 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -518,6 +518,25 @@ static inline void uart_unlock_and_check_sysrq(struct uart_port *port) if (sysrq_ch) handle_sysrq(sysrq_ch); } + +static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port, + unsigned long flags) +{ + int sysrq_ch; + + if (!port->has_sysrq) { + spin_unlock_irqrestore(&port->lock, flags); + return; + } + + sysrq_ch = port->sysrq_ch; + port->sysrq_ch = 0; + + spin_unlock_irqrestore(&port->lock, flags); + + if (sysrq_ch) + handle_sysrq(sysrq_ch); +} #else /* CONFIG_MAGIC_SYSRQ_SERIAL */ static inline int uart_handle_sysrq_char(struct uart_port *port, unsigned int ch) { @@ -531,6 +550,11 @@ static inline void uart_unlock_and_check_sysrq(struct uart_port *port) { spin_unlock(&port->lock); } +static inline void uart_unlock_and_check_sysrq_irqrestore(struct uart_port *port, + unsigned long flags) +{ + spin_unlock_irqrestore(&port->lock, flags); +} #endif /* CONFIG_MAGIC_SYSRQ_SERIAL */ /* -- cgit From 1e7107c5ef44431bc1ebbd4c353f1d7c22e5f2ec Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 16 Jun 2021 08:51:57 -0400 Subject: cgroup1: fix leaked context root causing sporadic NULL deref in LTP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Richard reported sporadic (roughly one in 10 or so) null dereferences and other strange behaviour for a set of automated LTP tests. Things like: BUG: kernel NULL pointer dereference, address: 0000000000000008 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 0 PID: 1516 Comm: umount Not tainted 5.10.0-yocto-standard #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-48-gd9c812dda519-prebuilt.qemu.org 04/01/2014 RIP: 0010:kernfs_sop_show_path+0x1b/0x60 ...or these others: RIP: 0010:do_mkdirat+0x6a/0xf0 RIP: 0010:d_alloc_parallel+0x98/0x510 RIP: 0010:do_readlinkat+0x86/0x120 There were other less common instances of some kind of a general scribble but the common theme was mount and cgroup and a dubious dentry triggering the NULL dereference. I was only able to reproduce it under qemu by replicating Richard's setup as closely as possible - I never did get it to happen on bare metal, even while keeping everything else the same. In commit 71d883c37e8d ("cgroup_do_mount(): massage calling conventions") we see this as a part of the overall change: -------------- struct cgroup_subsys *ss; - struct dentry *dentry; [...] - dentry = cgroup_do_mount(&cgroup_fs_type, fc->sb_flags, root, - CGROUP_SUPER_MAGIC, ns); [...] - if (percpu_ref_is_dying(&root->cgrp.self.refcnt)) { - struct super_block *sb = dentry->d_sb; - dput(dentry); + ret = cgroup_do_mount(fc, CGROUP_SUPER_MAGIC, ns); + if (!ret && percpu_ref_is_dying(&root->cgrp.self.refcnt)) { + struct super_block *sb = fc->root->d_sb; + dput(fc->root); deactivate_locked_super(sb); msleep(10); return restart_syscall(); } -------------- In changing from the local "*dentry" variable to using fc->root, we now export/leave that dentry pointer in the file context after doing the dput() in the unlikely "is_dying" case. With LTP doing a crazy amount of back to back mount/unmount [testcases/bin/cgroup_regression_5_1.sh] the unlikely becomes slightly likely and then bad things happen. A fix would be to not leave the stale reference in fc->root as follows: --------------                 dput(fc->root); + fc->root = NULL;                 deactivate_locked_super(sb); -------------- ...but then we are just open-coding a duplicate of fc_drop_locked() so we simply use that instead. Cc: Al Viro Cc: Tejun Heo Cc: Zefan Li Cc: Johannes Weiner Cc: stable@vger.kernel.org # v5.1+ Reported-by: Richard Purdie Fixes: 71d883c37e8d ("cgroup_do_mount(): massage calling conventions") Signed-off-by: Paul Gortmaker Signed-off-by: Tejun Heo --- fs/internal.h | 1 - include/linux/fs_context.h | 1 + kernel/cgroup/cgroup-v1.c | 4 +--- 3 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/internal.h b/fs/internal.h index 3ce8edbaa3ca..82e8eb32ff3d 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -61,7 +61,6 @@ extern void __init chrdev_init(void); */ extern const struct fs_context_operations legacy_fs_context_ops; extern int parse_monolithic_mount_data(struct fs_context *, void *); -extern void fc_drop_locked(struct fs_context *); extern void vfs_clean_context(struct fs_context *fc); extern int finish_clean_context(struct fs_context *fc); diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h index e2bc16300c82..6b54982fc5f3 100644 --- a/include/linux/fs_context.h +++ b/include/linux/fs_context.h @@ -141,6 +141,7 @@ extern int vfs_get_tree(struct fs_context *fc); extern void put_fs_context(struct fs_context *fc); extern int vfs_parse_fs_param_source(struct fs_context *fc, struct fs_parameter *param); +extern void fc_drop_locked(struct fs_context *fc); /* * sget() wrappers to be called from the ->get_tree() op. diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 8d6bf56ed77a..de2c432dee20 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1221,9 +1221,7 @@ int cgroup1_get_tree(struct fs_context *fc) ret = cgroup_do_get_tree(fc); if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) { - struct super_block *sb = fc->root->d_sb; - dput(fc->root); - deactivate_locked_super(sb); + fc_drop_locked(fc); ret = 1; } -- cgit From 8dad53a11f8d94dceb540a5f8f153484f42be84b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Jul 2021 15:50:17 -0700 Subject: mm: call flush_dcache_page() in memcpy_to_page() and memzero_page() memcpy_to_page and memzero_page can write to arbitrary pages, which could be in the page cache or in high memory, so call flush_kernel_dcache_pages to flush the dcache. This is a problem when using these helpers on dcache challeneged architectures. Right now there are just a few users, chances are no one used the PC floppy driver, the aha1542 driver for an ISA SCSI HBA, and a few advanced and optional btrfs and ext4 features on those platforms yet since the conversion. Link: https://lkml.kernel.org/r/20210713055231.137602-2-hch@lst.de Fixes: bb90d4bc7b6a ("mm/highmem: Lift memcpy_[to|from]_page to core") Fixes: 28961998f858 ("iov_iter: lift memzero_page() to highmem.h") Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Cc: Chaitanya Kulkarni Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 8c6e8e996c87..8e7e50a53a12 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -318,6 +318,7 @@ static inline void memcpy_to_page(struct page *page, size_t offset, VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to + offset, from, len); + flush_dcache_page(page); kunmap_local(to); } @@ -325,6 +326,7 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len) { char *addr = kmap_atomic(page); memset(addr + offset, 0, len); + flush_dcache_page(page); kunmap_atomic(addr); } -- cgit From d9a42b53bdf7b0329dc09a59fc1b092640b6da19 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Jul 2021 15:50:20 -0700 Subject: mm: use kmap_local_page in memzero_page The commit message introducing the global memzero_page explicitly mentions switching to kmap_local_page in the commit log but doesn't actually do that. Link: https://lkml.kernel.org/r/20210713055231.137602-3-hch@lst.de Fixes: 28961998f858 ("iov_iter: lift memzero_page() to highmem.h") Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ira Weiny Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 8e7e50a53a12..d9a606a9fc64 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -324,10 +324,10 @@ static inline void memcpy_to_page(struct page *page, size_t offset, static inline void memzero_page(struct page *page, size_t offset, size_t len) { - char *addr = kmap_atomic(page); + char *addr = kmap_local_page(page); memset(addr + offset, 0, len); flush_dcache_page(page); - kunmap_atomic(addr); + kunmap_local(addr); } #endif /* _LINUX_HIGHMEM_H */ -- cgit From 79e482e9c3ae86e849c701c846592e72baddda5a Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 23 Jul 2021 15:50:26 -0700 Subject: memblock: make for_each_mem_range() traverse MEMBLOCK_HOTPLUG regions Commit b10d6bca8720 ("arch, drivers: replace for_each_membock() with for_each_mem_range()") didn't take into account that when there is movable_node parameter in the kernel command line, for_each_mem_range() would skip ranges marked with MEMBLOCK_HOTPLUG. The page table setup code in POWER uses for_each_mem_range() to create the linear mapping of the physical memory and since the regions marked as MEMORY_HOTPLUG are skipped, they never make it to the linear map. A later access to the memory in those ranges will fail: BUG: Unable to handle kernel data access on write at 0xc000000400000000 Faulting instruction address: 0xc00000000008a3c0 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries Modules linked in: CPU: 0 PID: 53 Comm: kworker/u2:0 Not tainted 5.13.0 #7 NIP: c00000000008a3c0 LR: c0000000003c1ed8 CTR: 0000000000000040 REGS: c000000008a57770 TRAP: 0300 Not tainted (5.13.0) MSR: 8000000002009033 CR: 84222202 XER: 20040000 CFAR: c0000000003c1ed4 DAR: c000000400000000 DSISR: 42000000 IRQMASK: 0 GPR00: c0000000003c1ed8 c000000008a57a10 c0000000019da700 c000000400000000 GPR04: 0000000000000280 0000000000000180 0000000000000400 0000000000000200 GPR08: 0000000000000100 0000000000000080 0000000000000040 0000000000000300 GPR12: 0000000000000380 c000000001bc0000 c0000000001660c8 c000000006337e00 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000040000000 0000000020000000 c000000001a81990 c000000008c30000 GPR24: c000000008c20000 c000000001a81998 000fffffffff0000 c000000001a819a0 GPR28: c000000001a81908 c00c000001000000 c000000008c40000 c000000008a64680 NIP clear_user_page+0x50/0x80 LR __handle_mm_fault+0xc88/0x1910 Call Trace: __handle_mm_fault+0xc44/0x1910 (unreliable) handle_mm_fault+0x130/0x2a0 __get_user_pages+0x248/0x610 __get_user_pages_remote+0x12c/0x3e0 get_arg_page+0x54/0xf0 copy_string_kernel+0x11c/0x210 kernel_execve+0x16c/0x220 call_usermodehelper_exec_async+0x1b0/0x2f0 ret_from_kernel_thread+0x5c/0x70 Instruction dump: 79280fa4 79271764 79261f24 794ae8e2 7ca94214 7d683a14 7c893a14 7d893050 7d4903a6 60000000 60000000 60000000 <7c001fec> 7c091fec 7c081fec 7c051fec ---[ end trace 490b8c67e6075e09 ]--- Making for_each_mem_range() include MEMBLOCK_HOTPLUG regions in the traversal fixes this issue. Link: https://bugzilla.redhat.com/show_bug.cgi?id=1976100 Link: https://lkml.kernel.org/r/20210712071132.20902-1-rppt@kernel.org Fixes: b10d6bca8720 ("arch, drivers: replace for_each_membock() with for_each_mem_range()") Signed-off-by: Mike Rapoport Tested-by: Greg Kurz Reviewed-by: David Hildenbrand Cc: [5.10+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 4 ++-- mm/memblock.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index cbf46f56d105..4a53c3ca86bd 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -209,7 +209,7 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, */ #define for_each_mem_range(i, p_start, p_end) \ __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE, \ - MEMBLOCK_NONE, p_start, p_end, NULL) + MEMBLOCK_HOTPLUG, p_start, p_end, NULL) /** * for_each_mem_range_rev - reverse iterate through memblock areas from @@ -220,7 +220,7 @@ static inline void __next_physmem_range(u64 *idx, struct memblock_type *type, */ #define for_each_mem_range_rev(i, p_start, p_end) \ __for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE, \ - MEMBLOCK_NONE, p_start, p_end, NULL) + MEMBLOCK_HOTPLUG, p_start, p_end, NULL) /** * for_each_reserved_mem_range - iterate over all reserved memblock areas diff --git a/mm/memblock.c b/mm/memblock.c index 0041ff62c584..de7b553baa50 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -947,7 +947,8 @@ static bool should_skip_region(struct memblock_type *type, return true; /* skip hotpluggable memory regions if needed */ - if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) + if (movable_node_is_enabled() && memblock_is_hotpluggable(m) && + !(flags & MEMBLOCK_HOTPLUG)) return true; /* if we want mirror memory skip non-mirror memory regions */ -- cgit From bf88fef0b6f1488abeca594d377991171c00e52a Mon Sep 17 00:00:00 2001 From: Dmitry Osipenko Date: Sat, 17 Jul 2021 21:21:27 +0300 Subject: usb: otg-fsm: Fix hrtimer list corruption The HNP work can be re-scheduled while it's still in-fly. This results in re-initialization of the busy work, resetting the hrtimer's list node of the work and crashing kernel with null dereference within kernel/timer once work's timer is expired. It's very easy to trigger this problem by re-plugging USB cable quickly. Initialize HNP work only once to fix this trouble. Unable to handle kernel NULL pointer dereference at virtual address 00000126) ... PC is at __run_timers.part.0+0x150/0x228 LR is at __next_timer_interrupt+0x51/0x9c ... (__run_timers.part.0) from [] (run_timer_softirq+0x2f/0x50) (run_timer_softirq) from [] (__do_softirq+0xd5/0x2f0) (__do_softirq) from [] (irq_exit+0xab/0xb8) (irq_exit) from [] (handle_domain_irq+0x45/0x60) (handle_domain_irq) from [] (gic_handle_irq+0x6b/0x7c) (gic_handle_irq) from [] (__irq_svc+0x65/0xac) Cc: stable@vger.kernel.org Acked-by: Peter Chen Signed-off-by: Dmitry Osipenko Link: https://lore.kernel.org/r/20210717182134.30262-6-digetx@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/common/usb-otg-fsm.c | 6 +++++- include/linux/usb/otg-fsm.h | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/usb/common/usb-otg-fsm.c b/drivers/usb/common/usb-otg-fsm.c index 3740cf95560e..0697fde51d00 100644 --- a/drivers/usb/common/usb-otg-fsm.c +++ b/drivers/usb/common/usb-otg-fsm.c @@ -193,7 +193,11 @@ static void otg_start_hnp_polling(struct otg_fsm *fsm) if (!fsm->host_req_flag) return; - INIT_DELAYED_WORK(&fsm->hnp_polling_work, otg_hnp_polling_work); + if (!fsm->hnp_work_inited) { + INIT_DELAYED_WORK(&fsm->hnp_polling_work, otg_hnp_polling_work); + fsm->hnp_work_inited = true; + } + schedule_delayed_work(&fsm->hnp_polling_work, msecs_to_jiffies(T_HOST_REQ_POLL)); } diff --git a/include/linux/usb/otg-fsm.h b/include/linux/usb/otg-fsm.h index 3aee78dda16d..784659d4dc99 100644 --- a/include/linux/usb/otg-fsm.h +++ b/include/linux/usb/otg-fsm.h @@ -196,6 +196,7 @@ struct otg_fsm { struct mutex lock; u8 *host_req_flag; struct delayed_work hnp_polling_work; + bool hnp_work_inited; bool state_changed; }; -- cgit From 9635720b7c88592214562cb72605bdab6708006c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 27 Jul 2021 09:05:00 -0700 Subject: bpf, sockmap: Fix memleak on ingress msg enqueue If backlog handler is running during a tear down operation we may enqueue data on the ingress msg queue while tear down is trying to free it. sk_psock_backlog() sk_psock_handle_skb() skb_psock_skb_ingress() sk_psock_skb_ingress_enqueue() sk_psock_queue_msg(psock,msg) spin_lock(ingress_lock) sk_psock_zap_ingress() _sk_psock_purge_ingerss_msg() _sk_psock_purge_ingress_msg() -- free ingress_msg list -- spin_unlock(ingress_lock) spin_lock(ingress_lock) list_add_tail(msg,ingress_msg) <- entry on list with no one left to free it. spin_unlock(ingress_lock) To fix we only enqueue from backlog if the ENABLED bit is set. The tear down logic clears the bit with ingress_lock set so we wont enqueue the msg in the last step. Fixes: 799aa7f98d53 ("skmsg: Avoid lock_sock() in sk_psock_backlog()") Signed-off-by: John Fastabend Signed-off-by: Andrii Nakryiko Acked-by: Jakub Sitnicki Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210727160500.1713554-4-john.fastabend@gmail.com --- include/linux/skmsg.h | 54 +++++++++++++++++++++++++++++++++------------------ net/core/skmsg.c | 6 ------ 2 files changed, 35 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 96f319099744..14ab0c0bc924 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -285,11 +285,45 @@ static inline struct sk_psock *sk_psock(const struct sock *sk) return rcu_dereference_sk_user_data(sk); } +static inline void sk_psock_set_state(struct sk_psock *psock, + enum sk_psock_state_bits bit) +{ + set_bit(bit, &psock->state); +} + +static inline void sk_psock_clear_state(struct sk_psock *psock, + enum sk_psock_state_bits bit) +{ + clear_bit(bit, &psock->state); +} + +static inline bool sk_psock_test_state(const struct sk_psock *psock, + enum sk_psock_state_bits bit) +{ + return test_bit(bit, &psock->state); +} + +static inline void sock_drop(struct sock *sk, struct sk_buff *skb) +{ + sk_drops_add(sk, skb); + kfree_skb(skb); +} + +static inline void drop_sk_msg(struct sk_psock *psock, struct sk_msg *msg) +{ + if (msg->skb) + sock_drop(psock->sk, msg->skb); + kfree(msg); +} + static inline void sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { spin_lock_bh(&psock->ingress_lock); - list_add_tail(&msg->list, &psock->ingress_msg); + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + list_add_tail(&msg->list, &psock->ingress_msg); + else + drop_sk_msg(psock, msg); spin_unlock_bh(&psock->ingress_lock); } @@ -406,24 +440,6 @@ static inline void sk_psock_restore_proto(struct sock *sk, psock->psock_update_sk_prot(sk, psock, true); } -static inline void sk_psock_set_state(struct sk_psock *psock, - enum sk_psock_state_bits bit) -{ - set_bit(bit, &psock->state); -} - -static inline void sk_psock_clear_state(struct sk_psock *psock, - enum sk_psock_state_bits bit) -{ - clear_bit(bit, &psock->state); -} - -static inline bool sk_psock_test_state(const struct sk_psock *psock, - enum sk_psock_state_bits bit) -{ - return test_bit(bit, &psock->state); -} - static inline struct sk_psock *sk_psock_get(struct sock *sk) { struct sk_psock *psock; diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 036cdb33a94a..2d6249b28928 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -584,12 +584,6 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, return sk_psock_skb_ingress(psock, skb); } -static void sock_drop(struct sock *sk, struct sk_buff *skb) -{ - sk_drops_add(sk, skb); - kfree_skb(skb); -} - static void sk_psock_skb_state(struct sk_psock *psock, struct sk_psock_work_state *state, struct sk_buff *skb, -- cgit From f5e81d1117501546b7be050c5fbafa6efd2c722c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 13 Jul 2021 08:18:31 +0000 Subject: bpf: Introduce BPF nospec instruction for mitigating Spectre v4 In case of JITs, each of the JIT backends compiles the BPF nospec instruction /either/ to a machine instruction which emits a speculation barrier /or/ to /no/ machine instruction in case the underlying architecture is not affected by Speculative Store Bypass or has different mitigations in place already. This covers both x86 and (implicitly) arm64: In case of x86, we use 'lfence' instruction for mitigation. In case of arm64, we rely on the firmware mitigation as controlled via the ssbd kernel parameter. Whenever the mitigation is enabled, it works for all of the kernel code with no need to provide any additional instructions here (hence only comment in arm64 JIT). Other archs can follow as needed. The BPF nospec instruction is specifically targeting Spectre v4 since i) we don't use a serialization barrier for the Spectre v1 case, and ii) mitigation instructions for v1 and v4 might be different on some archs. The BPF nospec is required for a future commit, where the BPF verifier does annotate intermediate BPF programs with speculation barriers. Co-developed-by: Piotr Krysiuk Co-developed-by: Benedict Schlueter Signed-off-by: Daniel Borkmann Signed-off-by: Piotr Krysiuk Signed-off-by: Benedict Schlueter Acked-by: Alexei Starovoitov --- arch/arm/net/bpf_jit_32.c | 3 +++ arch/arm64/net/bpf_jit_comp.c | 13 +++++++++++++ arch/mips/net/ebpf_jit.c | 3 +++ arch/powerpc/net/bpf_jit_comp32.c | 6 ++++++ arch/powerpc/net/bpf_jit_comp64.c | 6 ++++++ arch/riscv/net/bpf_jit_comp32.c | 4 ++++ arch/riscv/net/bpf_jit_comp64.c | 4 ++++ arch/s390/net/bpf_jit_comp.c | 5 +++++ arch/sparc/net/bpf_jit_comp_64.c | 3 +++ arch/x86/net/bpf_jit_comp.c | 7 +++++++ arch/x86/net/bpf_jit_comp32.c | 6 ++++++ include/linux/filter.h | 15 +++++++++++++++ kernel/bpf/core.c | 19 ++++++++++++++++++- kernel/bpf/disasm.c | 16 +++++++++------- 14 files changed, 102 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 897634d0a67c..a951276f0547 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1602,6 +1602,9 @@ exit: rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx); emit_ldx_r(dst, rn, off, ctx, BPF_SIZE(code)); break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index dccf98a37283..41c23f474ea6 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -823,6 +823,19 @@ emit_cond_jmp: return ret; break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + /* + * Nothing required here. + * + * In case of arm64, we rely on the firmware mitigation of + * Speculative Store Bypass as controlled via the ssbd kernel + * parameter. Whenever the mitigation is enabled, it works + * for all of the kernel code with no need to provide any + * additional instructions. + */ + break; + /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 939dd06764bc..3a73e9375712 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -1355,6 +1355,9 @@ jeq_common: } break; + case BPF_ST | BPF_NOSPEC: /* speculation barrier */ + break; + case BPF_ST | BPF_B | BPF_MEM: case BPF_ST | BPF_H | BPF_MEM: case BPF_ST | BPF_W | BPF_MEM: diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index 34bb1583fc0c..beb12cbc8c29 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -737,6 +737,12 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * } break; + /* + * BPF_ST NOSPEC (speculation barrier) + */ + case BPF_ST | BPF_NOSPEC: + break; + /* * BPF_ST(X) */ diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index de8595880fee..b87a63dba9c8 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -627,6 +627,12 @@ emit_clear: } break; + /* + * BPF_ST NOSPEC (speculation barrier) + */ + case BPF_ST | BPF_NOSPEC: + break; + /* * BPF_ST(X) */ diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c index 81de865f4c7c..e6497424cbf6 100644 --- a/arch/riscv/net/bpf_jit_comp32.c +++ b/arch/riscv/net/bpf_jit_comp32.c @@ -1251,6 +1251,10 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, return -1; break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; + case BPF_ST | BPF_MEM | BPF_B: case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_W: diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 87e3bf5b9086..3af4131c22c7 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -939,6 +939,10 @@ out_be: emit_ld(rd, 0, RV_REG_T1, ctx); break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; + /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_B: emit_imm(RV_REG_T1, imm, ctx); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 2ae419f5115a..88419263a89a 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1153,6 +1153,11 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, break; } break; + /* + * BPF_NOSPEC (speculation barrier) + */ + case BPF_ST | BPF_NOSPEC: + break; /* * BPF_ST(X) */ diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 4b8d3c65d266..9a2f20cbd48b 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1287,6 +1287,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) return 1; break; } + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + break; /* ST: *(size *)(dst + off) = imm */ case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 4b951458c9fc..16d76f814e9b 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1219,6 +1219,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, } break; + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + if (boot_cpu_has(X86_FEATURE_XMM2)) + /* Emit 'lfence' */ + EMIT3(0x0F, 0xAE, 0xE8); + break; + /* ST: *(u8*)(dst_reg + off) = imm */ case BPF_ST | BPF_MEM | BPF_B: if (is_ereg(dst_reg)) diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 3da88ded6ee3..3bfda5f502cb 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -1886,6 +1886,12 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, i++; break; } + /* speculation barrier */ + case BPF_ST | BPF_NOSPEC: + if (boot_cpu_has(X86_FEATURE_XMM2)) + /* Emit 'lfence' */ + EMIT3(0x0F, 0xAE, 0xE8); + break; /* ST: *(u8*)(dst_reg + off) = imm */ case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_B: diff --git a/include/linux/filter.h b/include/linux/filter.h index 472f97074da0..83b896044e79 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -73,6 +73,11 @@ struct ctl_table_header; /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 +/* unused opcode to mark speculation barrier for mitigating + * Speculative Store Bypass + */ +#define BPF_NOSPEC 0xc0 + /* As per nm, we expose JITed images as text (code) section for * kallsyms. That way, tools like perf can find it to match * addresses. @@ -390,6 +395,16 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) .off = 0, \ .imm = 0 }) +/* Speculation barrier */ + +#define BPF_ST_NOSPEC() \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_NOSPEC, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0 }) + /* Internal classic blocks for direct assignment */ #define __BPF_STMT(CODE, K) \ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9b1577498373..b1a5fc04492b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -32,6 +32,8 @@ #include #include #include + +#include #include /* Registers */ @@ -1377,6 +1379,7 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) /* Non-UAPI available opcodes. */ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, + [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC, [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B, [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H, [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W, @@ -1621,7 +1624,21 @@ out: COND_JMP(s, JSGE, >=) COND_JMP(s, JSLE, <=) #undef COND_JMP - /* STX and ST and LDX*/ + /* ST, STX and LDX*/ + ST_NOSPEC: + /* Speculation barrier for mitigating Speculative Store Bypass. + * In case of arm64, we rely on the firmware mitigation as + * controlled via the ssbd kernel parameter. Whenever the + * mitigation is enabled, it works for all of the kernel code + * with no need to provide any additional instructions here. + * In case of x86, we use 'lfence' insn for mitigation. We + * reuse preexisting logic from Spectre v1 mitigation that + * happens to produce the required code on x86 for v4 as well. + */ +#ifdef CONFIG_X86 + barrier_nospec(); +#endif + CONT; #define LDST(SIZEOP, SIZE) \ STX_MEM_##SIZEOP: \ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index bbfc6bb79240..ca3cd9aaa6ce 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -206,15 +206,17 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, verbose(cbs->private_data, "BUG_%02x\n", insn->code); } } else if (class == BPF_ST) { - if (BPF_MODE(insn->code) != BPF_MEM) { + if (BPF_MODE(insn->code) == BPF_MEM) { + verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) { + verbose(cbs->private_data, "(%02x) nospec\n", insn->code); + } else { verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); - return; } - verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->imm); } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM) { verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); -- cgit From 2039f26f3aca5b0e419b98f65dd36481337b86ee Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 13 Jul 2021 08:18:31 +0000 Subject: bpf: Fix leakage due to insufficient speculative store bypass mitigation Spectre v4 gadgets make use of memory disambiguation, which is a set of techniques that execute memory access instructions, that is, loads and stores, out of program order; Intel's optimization manual, section 2.4.4.5: A load instruction micro-op may depend on a preceding store. Many microarchitectures block loads until all preceding store addresses are known. The memory disambiguator predicts which loads will not depend on any previous stores. When the disambiguator predicts that a load does not have such a dependency, the load takes its data from the L1 data cache. Eventually, the prediction is verified. If an actual conflict is detected, the load and all succeeding instructions are re-executed. af86ca4e3088 ("bpf: Prevent memory disambiguation attack") tried to mitigate this attack by sanitizing the memory locations through preemptive "fast" (low latency) stores of zero prior to the actual "slow" (high latency) store of a pointer value such that upon dependency misprediction the CPU then speculatively executes the load of the pointer value and retrieves the zero value instead of the attacker controlled scalar value previously stored at that location, meaning, subsequent access in the speculative domain is then redirected to the "zero page". The sanitized preemptive store of zero prior to the actual "slow" store is done through a simple ST instruction based on r10 (frame pointer) with relative offset to the stack location that the verifier has been tracking on the original used register for STX, which does not have to be r10. Thus, there are no memory dependencies for this store, since it's only using r10 and immediate constant of zero; hence af86ca4e3088 /assumed/ a low latency operation. However, a recent attack demonstrated that this mitigation is not sufficient since the preemptive store of zero could also be turned into a "slow" store and is thus bypassed as well: [...] // r2 = oob address (e.g. scalar) // r7 = pointer to map value 31: (7b) *(u64 *)(r10 -16) = r2 // r9 will remain "fast" register, r10 will become "slow" register below 32: (bf) r9 = r10 // JIT maps BPF reg to x86 reg: // r9 -> r15 (callee saved) // r10 -> rbp // train store forward prediction to break dependency link between both r9 // and r10 by evicting them from the predictor's LRU table. 33: (61) r0 = *(u32 *)(r7 +24576) 34: (63) *(u32 *)(r7 +29696) = r0 35: (61) r0 = *(u32 *)(r7 +24580) 36: (63) *(u32 *)(r7 +29700) = r0 37: (61) r0 = *(u32 *)(r7 +24584) 38: (63) *(u32 *)(r7 +29704) = r0 39: (61) r0 = *(u32 *)(r7 +24588) 40: (63) *(u32 *)(r7 +29708) = r0 [...] 543: (61) r0 = *(u32 *)(r7 +25596) 544: (63) *(u32 *)(r7 +30716) = r0 // prepare call to bpf_ringbuf_output() helper. the latter will cause rbp // to spill to stack memory while r13/r14/r15 (all callee saved regs) remain // in hardware registers. rbp becomes slow due to push/pop latency. below is // disasm of bpf_ringbuf_output() helper for better visual context: // // ffffffff8117ee20: 41 54 push r12 // ffffffff8117ee22: 55 push rbp // ffffffff8117ee23: 53 push rbx // ffffffff8117ee24: 48 f7 c1 fc ff ff ff test rcx,0xfffffffffffffffc // ffffffff8117ee2b: 0f 85 af 00 00 00 jne ffffffff8117eee0 <-- jump taken // [...] // ffffffff8117eee0: 49 c7 c4 ea ff ff ff mov r12,0xffffffffffffffea // ffffffff8117eee7: 5b pop rbx // ffffffff8117eee8: 5d pop rbp // ffffffff8117eee9: 4c 89 e0 mov rax,r12 // ffffffff8117eeec: 41 5c pop r12 // ffffffff8117eeee: c3 ret 545: (18) r1 = map[id:4] 547: (bf) r2 = r7 548: (b7) r3 = 0 549: (b7) r4 = 4 550: (85) call bpf_ringbuf_output#194288 // instruction 551 inserted by verifier \ 551: (7a) *(u64 *)(r10 -16) = 0 | /both/ are now slow stores here // storing map value pointer r7 at fp-16 | since value of r10 is "slow". 552: (7b) *(u64 *)(r10 -16) = r7 / // following "fast" read to the same memory location, but due to dependency // misprediction it will speculatively execute before insn 551/552 completes. 553: (79) r2 = *(u64 *)(r9 -16) // in speculative domain contains attacker controlled r2. in non-speculative // domain this contains r7, and thus accesses r7 +0 below. 554: (71) r3 = *(u8 *)(r2 +0) // leak r3 As can be seen, the current speculative store bypass mitigation which the verifier inserts at line 551 is insufficient since /both/, the write of the zero sanitation as well as the map value pointer are a high latency instruction due to prior memory access via push/pop of r10 (rbp) in contrast to the low latency read in line 553 as r9 (r15) which stays in hardware registers. Thus, architecturally, fp-16 is r7, however, microarchitecturally, fp-16 can still be r2. Initial thoughts to address this issue was to track spilled pointer loads from stack and enforce their load via LDX through r10 as well so that /both/ the preemptive store of zero /as well as/ the load use the /same/ register such that a dependency is created between the store and load. However, this option is not sufficient either since it can be bypassed as well under speculation. An updated attack with pointer spill/fills now _all_ based on r10 would look as follows: [...] // r2 = oob address (e.g. scalar) // r7 = pointer to map value [...] // longer store forward prediction training sequence than before. 2062: (61) r0 = *(u32 *)(r7 +25588) 2063: (63) *(u32 *)(r7 +30708) = r0 2064: (61) r0 = *(u32 *)(r7 +25592) 2065: (63) *(u32 *)(r7 +30712) = r0 2066: (61) r0 = *(u32 *)(r7 +25596) 2067: (63) *(u32 *)(r7 +30716) = r0 // store the speculative load address (scalar) this time after the store // forward prediction training. 2068: (7b) *(u64 *)(r10 -16) = r2 // preoccupy the CPU store port by running sequence of dummy stores. 2069: (63) *(u32 *)(r7 +29696) = r0 2070: (63) *(u32 *)(r7 +29700) = r0 2071: (63) *(u32 *)(r7 +29704) = r0 2072: (63) *(u32 *)(r7 +29708) = r0 2073: (63) *(u32 *)(r7 +29712) = r0 2074: (63) *(u32 *)(r7 +29716) = r0 2075: (63) *(u32 *)(r7 +29720) = r0 2076: (63) *(u32 *)(r7 +29724) = r0 2077: (63) *(u32 *)(r7 +29728) = r0 2078: (63) *(u32 *)(r7 +29732) = r0 2079: (63) *(u32 *)(r7 +29736) = r0 2080: (63) *(u32 *)(r7 +29740) = r0 2081: (63) *(u32 *)(r7 +29744) = r0 2082: (63) *(u32 *)(r7 +29748) = r0 2083: (63) *(u32 *)(r7 +29752) = r0 2084: (63) *(u32 *)(r7 +29756) = r0 2085: (63) *(u32 *)(r7 +29760) = r0 2086: (63) *(u32 *)(r7 +29764) = r0 2087: (63) *(u32 *)(r7 +29768) = r0 2088: (63) *(u32 *)(r7 +29772) = r0 2089: (63) *(u32 *)(r7 +29776) = r0 2090: (63) *(u32 *)(r7 +29780) = r0 2091: (63) *(u32 *)(r7 +29784) = r0 2092: (63) *(u32 *)(r7 +29788) = r0 2093: (63) *(u32 *)(r7 +29792) = r0 2094: (63) *(u32 *)(r7 +29796) = r0 2095: (63) *(u32 *)(r7 +29800) = r0 2096: (63) *(u32 *)(r7 +29804) = r0 2097: (63) *(u32 *)(r7 +29808) = r0 2098: (63) *(u32 *)(r7 +29812) = r0 // overwrite scalar with dummy pointer; same as before, also including the // sanitation store with 0 from the current mitigation by the verifier. 2099: (7a) *(u64 *)(r10 -16) = 0 | /both/ are now slow stores here 2100: (7b) *(u64 *)(r10 -16) = r7 | since store unit is still busy. // load from stack intended to bypass stores. 2101: (79) r2 = *(u64 *)(r10 -16) 2102: (71) r3 = *(u8 *)(r2 +0) // leak r3 [...] Looking at the CPU microarchitecture, the scheduler might issue loads (such as seen in line 2101) before stores (line 2099,2100) because the load execution units become available while the store execution unit is still busy with the sequence of dummy stores (line 2069-2098). And so the load may use the prior stored scalar from r2 at address r10 -16 for speculation. The updated attack may work less reliable on CPU microarchitectures where loads and stores share execution resources. This concludes that the sanitizing with zero stores from af86ca4e3088 ("bpf: Prevent memory disambiguation attack") is insufficient. Moreover, the detection of stack reuse from af86ca4e3088 where previously data (STACK_MISC) has been written to a given stack slot where a pointer value is now to be stored does not have sufficient coverage as precondition for the mitigation either; for several reasons outlined as follows: 1) Stack content from prior program runs could still be preserved and is therefore not "random", best example is to split a speculative store bypass attack between tail calls, program A would prepare and store the oob address at a given stack slot and then tail call into program B which does the "slow" store of a pointer to the stack with subsequent "fast" read. From program B PoV such stack slot type is STACK_INVALID, and therefore also must be subject to mitigation. 2) The STACK_SPILL must not be coupled to register_is_const(&stack->spilled_ptr) condition, for example, the previous content of that memory location could also be a pointer to map or map value. Without the fix, a speculative store bypass is not mitigated in such precondition and can then lead to a type confusion in the speculative domain leaking kernel memory near these pointer types. While brainstorming on various alternative mitigation possibilities, we also stumbled upon a retrospective from Chrome developers [0]: [...] For variant 4, we implemented a mitigation to zero the unused memory of the heap prior to allocation, which cost about 1% when done concurrently and 4% for scavenging. Variant 4 defeats everything we could think of. We explored more mitigations for variant 4 but the threat proved to be more pervasive and dangerous than we anticipated. For example, stack slots used by the register allocator in the optimizing compiler could be subject to type confusion, leading to pointer crafting. Mitigating type confusion for stack slots alone would have required a complete redesign of the backend of the optimizing compiler, perhaps man years of work, without a guarantee of completeness. [...] From BPF side, the problem space is reduced, however, options are rather limited. One idea that has been explored was to xor-obfuscate pointer spills to the BPF stack: [...] // preoccupy the CPU store port by running sequence of dummy stores. [...] 2106: (63) *(u32 *)(r7 +29796) = r0 2107: (63) *(u32 *)(r7 +29800) = r0 2108: (63) *(u32 *)(r7 +29804) = r0 2109: (63) *(u32 *)(r7 +29808) = r0 2110: (63) *(u32 *)(r7 +29812) = r0 // overwrite scalar with dummy pointer; xored with random 'secret' value // of 943576462 before store ... 2111: (b4) w11 = 943576462 2112: (af) r11 ^= r7 2113: (7b) *(u64 *)(r10 -16) = r11 2114: (79) r11 = *(u64 *)(r10 -16) 2115: (b4) w2 = 943576462 2116: (af) r2 ^= r11 // ... and restored with the same 'secret' value with the help of AX reg. 2117: (71) r3 = *(u8 *)(r2 +0) [...] While the above would not prevent speculation, it would make data leakage infeasible by directing it to random locations. In order to be effective and prevent type confusion under speculation, such random secret would have to be regenerated for each store. The additional complexity involved for a tracking mechanism that prevents jumps such that restoring spilled pointers would not get corrupted is not worth the gain for unprivileged. Hence, the fix in here eventually opted for emitting a non-public BPF_ST | BPF_NOSPEC instruction which the x86 JIT translates into a lfence opcode. Inserting the latter in between the store and load instruction is one of the mitigations options [1]. The x86 instruction manual notes: [...] An LFENCE that follows an instruction that stores to memory might complete before the data being stored have become globally visible. [...] The latter meaning that the preceding store instruction finished execution and the store is at minimum guaranteed to be in the CPU's store queue, but it's not guaranteed to be in that CPU's L1 cache at that point (globally visible). The latter would only be guaranteed via sfence. So the load which is guaranteed to execute after the lfence for that local CPU would have to rely on store-to-load forwarding. [2], in section 2.3 on store buffers says: [...] For every store operation that is added to the ROB, an entry is allocated in the store buffer. This entry requires both the virtual and physical address of the target. Only if there is no free entry in the store buffer, the frontend stalls until there is an empty slot available in the store buffer again. Otherwise, the CPU can immediately continue adding subsequent instructions to the ROB and execute them out of order. On Intel CPUs, the store buffer has up to 56 entries. [...] One small upside on the fix is that it lifts constraints from af86ca4e3088 where the sanitize_stack_off relative to r10 must be the same when coming from different paths. The BPF_ST | BPF_NOSPEC gets emitted after a BPF_STX or BPF_ST instruction. This happens either when we store a pointer or data value to the BPF stack for the first time, or upon later pointer spills. The former needs to be enforced since otherwise stale stack data could be leaked under speculation as outlined earlier. For non-x86 JITs the BPF_ST | BPF_NOSPEC mapping is currently optimized away, but others could emit a speculation barrier as well if necessary. For real-world unprivileged programs e.g. generated by LLVM, pointer spill/fill is only generated upon register pressure and LLVM only tries to do that for pointers which are not used often. The program main impact will be the initial BPF_ST | BPF_NOSPEC sanitation for the STACK_INVALID case when the first write to a stack slot occurs e.g. upon map lookup. In future we might refine ways to mitigate the latter cost. [0] https://arxiv.org/pdf/1902.05178.pdf [1] https://msrc-blog.microsoft.com/2018/05/21/analysis-and-mitigation-of-speculative-store-bypass-cve-2018-3639/ [2] https://arxiv.org/pdf/1905.05725.pdf Fixes: af86ca4e3088 ("bpf: Prevent memory disambiguation attack") Fixes: f7cf25b2026d ("bpf: track spill/fill of constants") Co-developed-by: Piotr Krysiuk Co-developed-by: Benedict Schlueter Signed-off-by: Daniel Borkmann Signed-off-by: Piotr Krysiuk Signed-off-by: Benedict Schlueter Acked-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/verifier.c | 87 ++++++++++++++++---------------------------- 2 files changed, 33 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 7ba7e800d472..828d08afeee0 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -340,8 +340,8 @@ struct bpf_insn_aux_data { }; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ - int sanitize_stack_off; /* stack slot to be cleared */ u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ + bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ bool zext_dst; /* this insn zero extends dst reg */ u8 alu_state; /* used in combination with alu_limit */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 657062cb4d85..f9bda5476ea5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2610,6 +2610,19 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, cur = env->cur_state->frame[env->cur_state->curframe]; if (value_regno >= 0) reg = &cur->regs[value_regno]; + if (!env->bypass_spec_v4) { + bool sanitize = reg && is_spillable_regtype(reg->type); + + for (i = 0; i < size; i++) { + if (state->stack[spi].slot_type[i] == STACK_INVALID) { + sanitize = true; + break; + } + } + + if (sanitize) + env->insn_aux_data[insn_idx].sanitize_stack_spill = true; + } if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) && !register_is_null(reg) && env->bpf_capable) { @@ -2632,47 +2645,10 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, verbose(env, "invalid size of register spill\n"); return -EACCES; } - if (state != cur && reg->type == PTR_TO_STACK) { verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); return -EINVAL; } - - if (!env->bypass_spec_v4) { - bool sanitize = false; - - if (state->stack[spi].slot_type[0] == STACK_SPILL && - register_is_const(&state->stack[spi].spilled_ptr)) - sanitize = true; - for (i = 0; i < BPF_REG_SIZE; i++) - if (state->stack[spi].slot_type[i] == STACK_MISC) { - sanitize = true; - break; - } - if (sanitize) { - int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; - int soff = (-spi - 1) * BPF_REG_SIZE; - - /* detected reuse of integer stack slot with a pointer - * which means either llvm is reusing stack slot or - * an attacker is trying to exploit CVE-2018-3639 - * (speculative store bypass) - * Have to sanitize that slot with preemptive - * store of zero. - */ - if (*poff && *poff != soff) { - /* disallow programs where single insn stores - * into two different stack slots, since verifier - * cannot sanitize them - */ - verbose(env, - "insn %d cannot access two stack slots fp%d and fp%d", - insn_idx, *poff, soff); - return -EINVAL; - } - *poff = soff; - } - } save_register_state(state, spi, reg); } else { u8 type = STACK_MISC; @@ -11913,35 +11889,33 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { bpf_convert_ctx_access_t convert_ctx_access; + bool ctx_access; if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || insn->code == (BPF_LDX | BPF_MEM | BPF_H) || insn->code == (BPF_LDX | BPF_MEM | BPF_W) || - insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) + insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) { type = BPF_READ; - else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || - insn->code == (BPF_STX | BPF_MEM | BPF_H) || - insn->code == (BPF_STX | BPF_MEM | BPF_W) || - insn->code == (BPF_STX | BPF_MEM | BPF_DW)) + ctx_access = true; + } else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || + insn->code == (BPF_STX | BPF_MEM | BPF_H) || + insn->code == (BPF_STX | BPF_MEM | BPF_W) || + insn->code == (BPF_STX | BPF_MEM | BPF_DW) || + insn->code == (BPF_ST | BPF_MEM | BPF_B) || + insn->code == (BPF_ST | BPF_MEM | BPF_H) || + insn->code == (BPF_ST | BPF_MEM | BPF_W) || + insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { type = BPF_WRITE; - else + ctx_access = BPF_CLASS(insn->code) == BPF_STX; + } else { continue; + } if (type == BPF_WRITE && - env->insn_aux_data[i + delta].sanitize_stack_off) { + env->insn_aux_data[i + delta].sanitize_stack_spill) { struct bpf_insn patch[] = { - /* Sanitize suspicious stack slot with zero. - * There are no memory dependencies for this store, - * since it's only using frame pointer and immediate - * constant of zero - */ - BPF_ST_MEM(BPF_DW, BPF_REG_FP, - env->insn_aux_data[i + delta].sanitize_stack_off, - 0), - /* the original STX instruction will immediately - * overwrite the same stack slot with appropriate value - */ *insn, + BPF_ST_NOSPEC(), }; cnt = ARRAY_SIZE(patch); @@ -11955,6 +11929,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) continue; } + if (!ctx_access) + continue; + switch (env->insn_aux_data[i + delta].ptr_type) { case PTR_TO_CTX: if (!ops->convert_ctx_access) -- cgit From 40e159403896f7d55c98f858d0b20fee1d941fa4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 2 Aug 2021 12:21:30 +0100 Subject: mhi: Fix networking tree build. Signed-off-by: David S. Miller --- include/linux/mhi.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 944aa3aa3035..5e08468854db 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -719,8 +719,13 @@ void mhi_device_put(struct mhi_device *mhi_dev); * host and device execution environments match and * channels are in a DISABLED state. * @mhi_dev: Device associated with the channels + * @flags: MHI channel flags */ -int mhi_prepare_for_transfer(struct mhi_device *mhi_dev); +int mhi_prepare_for_transfer(struct mhi_device *mhi_dev, + unsigned int flags); + +/* Automatically allocate and queue inbound buffers */ +#define MHI_CH_INBOUND_ALLOC_BUFS BIT(0) /** * mhi_unprepare_from_transfer - Reset UL and DL channels for data transfer. -- cgit From 1c69d7cf4a8b6b6cfd920a1e809f1cd33ae4369c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 2 Aug 2021 07:30:29 -0700 Subject: Revert "mhi: Fix networking tree build." This reverts commit 40e159403896f7d55c98f858d0b20fee1d941fa4. Looks like this commit breaks the build for me. Signed-off-by: Jakub Kicinski --- include/linux/mhi.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 5e08468854db..944aa3aa3035 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -719,13 +719,8 @@ void mhi_device_put(struct mhi_device *mhi_dev); * host and device execution environments match and * channels are in a DISABLED state. * @mhi_dev: Device associated with the channels - * @flags: MHI channel flags */ -int mhi_prepare_for_transfer(struct mhi_device *mhi_dev, - unsigned int flags); - -/* Automatically allocate and queue inbound buffers */ -#define MHI_CH_INBOUND_ALLOC_BUFS BIT(0) +int mhi_prepare_for_transfer(struct mhi_device *mhi_dev); /** * mhi_unprepare_from_transfer - Reset UL and DL channels for data transfer. -- cgit From f01639589e252a6f72c04716e1b5f9bb10e2debc Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Fri, 23 Jul 2021 19:54:46 +0800 Subject: soundwire: move intel sdw register definitions to sdw_intel.h Those Intel sdw registers will be used by ASoC SOF drivers in the following commits. So move those definitions to sdw_intel.h and it can be visible to SOF drivers. Signed-off-by: Bard Liao Reviewed-by: Pierre-Louis Bossart Reviewed-by: Ranjani Sridharan Acked-By: Vinod Koul Link: https://lore.kernel.org/r/20210723115451.7245-2-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- drivers/soundwire/intel.c | 74 ---------------------------------- drivers/soundwire/intel_init.c | 6 --- include/linux/soundwire/sdw_intel.h | 79 +++++++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 80 deletions(-) (limited to 'include/linux') diff --git a/drivers/soundwire/intel.c b/drivers/soundwire/intel.c index c11e3d8cd308..15668d6fecd6 100644 --- a/drivers/soundwire/intel.c +++ b/drivers/soundwire/intel.c @@ -40,80 +40,6 @@ static int md_flags; module_param_named(sdw_md_flags, md_flags, int, 0444); MODULE_PARM_DESC(sdw_md_flags, "SoundWire Intel Master device flags (0x0 all off)"); -/* Intel SHIM Registers Definition */ -#define SDW_SHIM_LCAP 0x0 -#define SDW_SHIM_LCTL 0x4 -#define SDW_SHIM_IPPTR 0x8 -#define SDW_SHIM_SYNC 0xC - -#define SDW_SHIM_CTLSCAP(x) (0x010 + 0x60 * (x)) -#define SDW_SHIM_CTLS0CM(x) (0x012 + 0x60 * (x)) -#define SDW_SHIM_CTLS1CM(x) (0x014 + 0x60 * (x)) -#define SDW_SHIM_CTLS2CM(x) (0x016 + 0x60 * (x)) -#define SDW_SHIM_CTLS3CM(x) (0x018 + 0x60 * (x)) -#define SDW_SHIM_PCMSCAP(x) (0x020 + 0x60 * (x)) - -#define SDW_SHIM_PCMSYCHM(x, y) (0x022 + (0x60 * (x)) + (0x2 * (y))) -#define SDW_SHIM_PCMSYCHC(x, y) (0x042 + (0x60 * (x)) + (0x2 * (y))) -#define SDW_SHIM_PDMSCAP(x) (0x062 + 0x60 * (x)) -#define SDW_SHIM_IOCTL(x) (0x06C + 0x60 * (x)) -#define SDW_SHIM_CTMCTL(x) (0x06E + 0x60 * (x)) - -#define SDW_SHIM_WAKEEN 0x190 -#define SDW_SHIM_WAKESTS 0x192 - -#define SDW_SHIM_LCTL_SPA BIT(0) -#define SDW_SHIM_LCTL_SPA_MASK GENMASK(3, 0) -#define SDW_SHIM_LCTL_CPA BIT(8) -#define SDW_SHIM_LCTL_CPA_MASK GENMASK(11, 8) - -#define SDW_SHIM_SYNC_SYNCPRD_VAL_24 (24000 / SDW_CADENCE_GSYNC_KHZ - 1) -#define SDW_SHIM_SYNC_SYNCPRD_VAL_38_4 (38400 / SDW_CADENCE_GSYNC_KHZ - 1) -#define SDW_SHIM_SYNC_SYNCPRD GENMASK(14, 0) -#define SDW_SHIM_SYNC_SYNCCPU BIT(15) -#define SDW_SHIM_SYNC_CMDSYNC_MASK GENMASK(19, 16) -#define SDW_SHIM_SYNC_CMDSYNC BIT(16) -#define SDW_SHIM_SYNC_SYNCGO BIT(24) - -#define SDW_SHIM_PCMSCAP_ISS GENMASK(3, 0) -#define SDW_SHIM_PCMSCAP_OSS GENMASK(7, 4) -#define SDW_SHIM_PCMSCAP_BSS GENMASK(12, 8) - -#define SDW_SHIM_PCMSYCM_LCHN GENMASK(3, 0) -#define SDW_SHIM_PCMSYCM_HCHN GENMASK(7, 4) -#define SDW_SHIM_PCMSYCM_STREAM GENMASK(13, 8) -#define SDW_SHIM_PCMSYCM_DIR BIT(15) - -#define SDW_SHIM_PDMSCAP_ISS GENMASK(3, 0) -#define SDW_SHIM_PDMSCAP_OSS GENMASK(7, 4) -#define SDW_SHIM_PDMSCAP_BSS GENMASK(12, 8) -#define SDW_SHIM_PDMSCAP_CPSS GENMASK(15, 13) - -#define SDW_SHIM_IOCTL_MIF BIT(0) -#define SDW_SHIM_IOCTL_CO BIT(1) -#define SDW_SHIM_IOCTL_COE BIT(2) -#define SDW_SHIM_IOCTL_DO BIT(3) -#define SDW_SHIM_IOCTL_DOE BIT(4) -#define SDW_SHIM_IOCTL_BKE BIT(5) -#define SDW_SHIM_IOCTL_WPDD BIT(6) -#define SDW_SHIM_IOCTL_CIBD BIT(8) -#define SDW_SHIM_IOCTL_DIBD BIT(9) - -#define SDW_SHIM_CTMCTL_DACTQE BIT(0) -#define SDW_SHIM_CTMCTL_DODS BIT(1) -#define SDW_SHIM_CTMCTL_DOAIS GENMASK(4, 3) - -#define SDW_SHIM_WAKEEN_ENABLE BIT(0) -#define SDW_SHIM_WAKESTS_STATUS BIT(0) - -/* Intel ALH Register definitions */ -#define SDW_ALH_STRMZCFG(x) (0x000 + (0x4 * (x))) -#define SDW_ALH_NUM_STREAMS 64 - -#define SDW_ALH_STRMZCFG_DMAT_VAL 0x3 -#define SDW_ALH_STRMZCFG_DMAT GENMASK(7, 0) -#define SDW_ALH_STRMZCFG_CHN GENMASK(19, 16) - enum intel_pdi_type { INTEL_PDI_IN = 0, INTEL_PDI_OUT = 1, diff --git a/drivers/soundwire/intel_init.c b/drivers/soundwire/intel_init.c index 9e283bef53d2..03ff69ab1797 100644 --- a/drivers/soundwire/intel_init.c +++ b/drivers/soundwire/intel_init.c @@ -18,12 +18,6 @@ #include "cadence_master.h" #include "intel.h" -#define SDW_SHIM_LCAP 0x0 -#define SDW_SHIM_BASE 0x2C000 -#define SDW_ALH_BASE 0x2C800 -#define SDW_LINK_BASE 0x30000 -#define SDW_LINK_SIZE 0x10000 - static void intel_link_dev_release(struct device *dev) { struct auxiliary_device *auxdev = to_auxiliary_dev(dev); diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 1ebea7764011..7fce6aee0c36 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -7,6 +7,85 @@ #include #include +#define SDW_SHIM_BASE 0x2C000 +#define SDW_ALH_BASE 0x2C800 +#define SDW_LINK_BASE 0x30000 +#define SDW_LINK_SIZE 0x10000 + +/* Intel SHIM Registers Definition */ +#define SDW_SHIM_LCAP 0x0 +#define SDW_SHIM_LCTL 0x4 +#define SDW_SHIM_IPPTR 0x8 +#define SDW_SHIM_SYNC 0xC + +#define SDW_SHIM_CTLSCAP(x) (0x010 + 0x60 * (x)) +#define SDW_SHIM_CTLS0CM(x) (0x012 + 0x60 * (x)) +#define SDW_SHIM_CTLS1CM(x) (0x014 + 0x60 * (x)) +#define SDW_SHIM_CTLS2CM(x) (0x016 + 0x60 * (x)) +#define SDW_SHIM_CTLS3CM(x) (0x018 + 0x60 * (x)) +#define SDW_SHIM_PCMSCAP(x) (0x020 + 0x60 * (x)) + +#define SDW_SHIM_PCMSYCHM(x, y) (0x022 + (0x60 * (x)) + (0x2 * (y))) +#define SDW_SHIM_PCMSYCHC(x, y) (0x042 + (0x60 * (x)) + (0x2 * (y))) +#define SDW_SHIM_PDMSCAP(x) (0x062 + 0x60 * (x)) +#define SDW_SHIM_IOCTL(x) (0x06C + 0x60 * (x)) +#define SDW_SHIM_CTMCTL(x) (0x06E + 0x60 * (x)) + +#define SDW_SHIM_WAKEEN 0x190 +#define SDW_SHIM_WAKESTS 0x192 + +#define SDW_SHIM_LCTL_SPA BIT(0) +#define SDW_SHIM_LCTL_SPA_MASK GENMASK(3, 0) +#define SDW_SHIM_LCTL_CPA BIT(8) +#define SDW_SHIM_LCTL_CPA_MASK GENMASK(11, 8) + +#define SDW_SHIM_SYNC_SYNCPRD_VAL_24 (24000 / SDW_CADENCE_GSYNC_KHZ - 1) +#define SDW_SHIM_SYNC_SYNCPRD_VAL_38_4 (38400 / SDW_CADENCE_GSYNC_KHZ - 1) +#define SDW_SHIM_SYNC_SYNCPRD GENMASK(14, 0) +#define SDW_SHIM_SYNC_SYNCCPU BIT(15) +#define SDW_SHIM_SYNC_CMDSYNC_MASK GENMASK(19, 16) +#define SDW_SHIM_SYNC_CMDSYNC BIT(16) +#define SDW_SHIM_SYNC_SYNCGO BIT(24) + +#define SDW_SHIM_PCMSCAP_ISS GENMASK(3, 0) +#define SDW_SHIM_PCMSCAP_OSS GENMASK(7, 4) +#define SDW_SHIM_PCMSCAP_BSS GENMASK(12, 8) + +#define SDW_SHIM_PCMSYCM_LCHN GENMASK(3, 0) +#define SDW_SHIM_PCMSYCM_HCHN GENMASK(7, 4) +#define SDW_SHIM_PCMSYCM_STREAM GENMASK(13, 8) +#define SDW_SHIM_PCMSYCM_DIR BIT(15) + +#define SDW_SHIM_PDMSCAP_ISS GENMASK(3, 0) +#define SDW_SHIM_PDMSCAP_OSS GENMASK(7, 4) +#define SDW_SHIM_PDMSCAP_BSS GENMASK(12, 8) +#define SDW_SHIM_PDMSCAP_CPSS GENMASK(15, 13) + +#define SDW_SHIM_IOCTL_MIF BIT(0) +#define SDW_SHIM_IOCTL_CO BIT(1) +#define SDW_SHIM_IOCTL_COE BIT(2) +#define SDW_SHIM_IOCTL_DO BIT(3) +#define SDW_SHIM_IOCTL_DOE BIT(4) +#define SDW_SHIM_IOCTL_BKE BIT(5) +#define SDW_SHIM_IOCTL_WPDD BIT(6) +#define SDW_SHIM_IOCTL_CIBD BIT(8) +#define SDW_SHIM_IOCTL_DIBD BIT(9) + +#define SDW_SHIM_CTMCTL_DACTQE BIT(0) +#define SDW_SHIM_CTMCTL_DODS BIT(1) +#define SDW_SHIM_CTMCTL_DOAIS GENMASK(4, 3) + +#define SDW_SHIM_WAKEEN_ENABLE BIT(0) +#define SDW_SHIM_WAKESTS_STATUS BIT(0) + +/* Intel ALH Register definitions */ +#define SDW_ALH_STRMZCFG(x) (0x000 + (0x4 * (x))) +#define SDW_ALH_NUM_STREAMS 64 + +#define SDW_ALH_STRMZCFG_DMAT_VAL 0x3 +#define SDW_ALH_STRMZCFG_DMAT GENMASK(7, 0) +#define SDW_ALH_STRMZCFG_CHN GENMASK(19, 16) + /** * struct sdw_intel_stream_params_data: configuration passed during * the @params_stream callback, e.g. for interaction with DSP -- cgit From 60e9feb781dfe84158b4ec7a4d61c5103e96e6f3 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Fri, 23 Jul 2021 19:54:51 +0800 Subject: soundwire: intel: introduce shim and alh base shim base and alh base are platform-dependent. Adding these two parameters allows us to use different shim/alh base for each platform. Signed-off-by: Bard Liao Reviewed-by: Pierre-Louis Bossart Reviewed-by: Ranjani Sridharan Acked-By: Vinod Koul Link: https://lore.kernel.org/r/20210723115451.7245-7-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- drivers/soundwire/intel_init.c | 8 +++++--- include/linux/soundwire/sdw_intel.h | 8 ++++++++ sound/soc/sof/intel/hda.c | 2 ++ 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/soundwire/intel_init.c b/drivers/soundwire/intel_init.c index 03ff69ab1797..e329022e1669 100644 --- a/drivers/soundwire/intel_init.c +++ b/drivers/soundwire/intel_init.c @@ -63,8 +63,8 @@ static struct sdw_intel_link_dev *intel_link_dev_register(struct sdw_intel_res * link->mmio_base = res->mmio_base; link->registers = res->mmio_base + SDW_LINK_BASE + (SDW_LINK_SIZE * link_id); - link->shim = res->mmio_base + SDW_SHIM_BASE; - link->alh = res->mmio_base + SDW_ALH_BASE; + link->shim = res->mmio_base + res->shim_base; + link->alh = res->mmio_base + res->alh_base; link->ops = res->ops; link->dev = res->dev; @@ -214,6 +214,8 @@ static struct sdw_intel_ctx } ctx->mmio_base = res->mmio_base; + ctx->shim_base = res->shim_base; + ctx->alh_base = res->alh_base; ctx->link_mask = res->link_mask; ctx->handle = res->handle; mutex_init(&ctx->shim_lock); @@ -302,7 +304,7 @@ sdw_intel_startup_controller(struct sdw_intel_ctx *ctx) return -EINVAL; /* Check SNDWLCAP.LCOUNT */ - caps = ioread32(ctx->mmio_base + SDW_SHIM_BASE + SDW_SHIM_LCAP); + caps = ioread32(ctx->mmio_base + ctx->shim_base + SDW_SHIM_LCAP); caps &= GENMASK(2, 0); /* Check HW supported vs property value */ diff --git a/include/linux/soundwire/sdw_intel.h b/include/linux/soundwire/sdw_intel.h index 7fce6aee0c36..8a463b8fc12a 100644 --- a/include/linux/soundwire/sdw_intel.h +++ b/include/linux/soundwire/sdw_intel.h @@ -195,6 +195,8 @@ struct sdw_intel_slave_id { * @link_list: list to handle interrupts across all links * @shim_lock: mutex to handle concurrent rmw access to shared SHIM registers. * @shim_mask: flags to track initialization of SHIM shared registers + * @shim_base: sdw shim base. + * @alh_base: sdw alh base. */ struct sdw_intel_ctx { int count; @@ -207,6 +209,8 @@ struct sdw_intel_ctx { struct list_head link_list; struct mutex shim_lock; /* lock for access to shared SHIM registers */ u32 shim_mask; + u32 shim_base; + u32 alh_base; }; /** @@ -225,6 +229,8 @@ struct sdw_intel_ctx { * machine-specific quirks are handled in the DSP driver. * @clock_stop_quirks: mask array of possible behaviors requested by the * DSP driver. The quirks are common for all links for now. + * @shim_base: sdw shim base. + * @alh_base: sdw alh base. */ struct sdw_intel_res { int count; @@ -236,6 +242,8 @@ struct sdw_intel_res { struct device *dev; u32 link_mask; u32 clock_stop_quirks; + u32 shim_base; + u32 alh_base; }; /* diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c index c979581c6812..b4e35fbbe693 100644 --- a/sound/soc/sof/intel/hda.c +++ b/sound/soc/sof/intel/hda.c @@ -166,6 +166,8 @@ static int hda_sdw_probe(struct snd_sof_dev *sdev) memset(&res, 0, sizeof(res)); res.mmio_base = sdev->bar[HDA_DSP_BAR]; + res.shim_base = hdev->desc->sdw_shim_base; + res.alh_base = hdev->desc->sdw_alh_base; res.irq = sdev->ipc_irq; res.handle = hdev->info.handle; res.parent = sdev->dev; -- cgit From ce78ffa3ef1681065ba451cfd545da6126f5ca88 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 3 Aug 2021 11:14:03 +0100 Subject: net: really fix the build... Signed-off-by: David S. Miller --- drivers/bus/mhi/core/internal.h | 2 +- drivers/bus/mhi/core/main.c | 9 ++++++--- drivers/net/mhi/net.c | 2 +- drivers/net/wwan/mhi_wwan_ctrl.c | 2 +- include/linux/mhi.h | 7 ++++++- net/qrtr/mhi.c | 16 +++++++++++++++- 6 files changed, 30 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/bus/mhi/core/internal.h b/drivers/bus/mhi/core/internal.h index 5b9ea66b92dc..bc239a11aa69 100644 --- a/drivers/bus/mhi/core/internal.h +++ b/drivers/bus/mhi/core/internal.h @@ -682,7 +682,7 @@ void mhi_rddm_prepare(struct mhi_controller *mhi_cntrl, struct image_info *img_info); void mhi_fw_load_handler(struct mhi_controller *mhi_cntrl); int mhi_prepare_channel(struct mhi_controller *mhi_cntrl, - struct mhi_chan *mhi_chan); + struct mhi_chan *mhi_chan, unsigned int flags); int mhi_init_chan_ctxt(struct mhi_controller *mhi_cntrl, struct mhi_chan *mhi_chan); void mhi_deinit_chan_ctxt(struct mhi_controller *mhi_cntrl, diff --git a/drivers/bus/mhi/core/main.c b/drivers/bus/mhi/core/main.c index fc9196f11cb7..84448233f64c 100644 --- a/drivers/bus/mhi/core/main.c +++ b/drivers/bus/mhi/core/main.c @@ -1430,7 +1430,7 @@ exit_unprepare_channel: } int mhi_prepare_channel(struct mhi_controller *mhi_cntrl, - struct mhi_chan *mhi_chan) + struct mhi_chan *mhi_chan, unsigned int flags) { int ret = 0; struct device *dev = &mhi_chan->mhi_dev->dev; @@ -1455,6 +1455,9 @@ int mhi_prepare_channel(struct mhi_controller *mhi_cntrl, if (ret) goto error_pm_state; + if (mhi_chan->dir == DMA_FROM_DEVICE) + mhi_chan->pre_alloc = !!(flags & MHI_CH_INBOUND_ALLOC_BUFS); + /* Pre-allocate buffer for xfer ring */ if (mhi_chan->pre_alloc) { int nr_el = get_nr_avail_ring_elements(mhi_cntrl, @@ -1610,7 +1613,7 @@ void mhi_reset_chan(struct mhi_controller *mhi_cntrl, struct mhi_chan *mhi_chan) } /* Move channel to start state */ -int mhi_prepare_for_transfer(struct mhi_device *mhi_dev) +int mhi_prepare_for_transfer(struct mhi_device *mhi_dev, unsigned int flags) { int ret, dir; struct mhi_controller *mhi_cntrl = mhi_dev->mhi_cntrl; @@ -1621,7 +1624,7 @@ int mhi_prepare_for_transfer(struct mhi_device *mhi_dev) if (!mhi_chan) continue; - ret = mhi_prepare_channel(mhi_cntrl, mhi_chan); + ret = mhi_prepare_channel(mhi_cntrl, mhi_chan, flags); if (ret) goto error_open_chan; } diff --git a/drivers/net/mhi/net.c b/drivers/net/mhi/net.c index e60e38c1f09d..11be6bcdd551 100644 --- a/drivers/net/mhi/net.c +++ b/drivers/net/mhi/net.c @@ -335,7 +335,7 @@ static int mhi_net_newlink(void *ctxt, struct net_device *ndev, u32 if_id, u64_stats_init(&mhi_netdev->stats.tx_syncp); /* Start MHI channels */ - err = mhi_prepare_for_transfer(mhi_dev); + err = mhi_prepare_for_transfer(mhi_dev, 0); if (err) goto out_err; diff --git a/drivers/net/wwan/mhi_wwan_ctrl.c b/drivers/net/wwan/mhi_wwan_ctrl.c index 1bc6b69aa530..1e18420ce404 100644 --- a/drivers/net/wwan/mhi_wwan_ctrl.c +++ b/drivers/net/wwan/mhi_wwan_ctrl.c @@ -110,7 +110,7 @@ static int mhi_wwan_ctrl_start(struct wwan_port *port) int ret; /* Start mhi device's channel(s) */ - ret = mhi_prepare_for_transfer(mhiwwan->mhi_dev); + ret = mhi_prepare_for_transfer(mhiwwan->mhi_dev, 0); if (ret) return ret; diff --git a/include/linux/mhi.h b/include/linux/mhi.h index 944aa3aa3035..5e08468854db 100644 --- a/include/linux/mhi.h +++ b/include/linux/mhi.h @@ -719,8 +719,13 @@ void mhi_device_put(struct mhi_device *mhi_dev); * host and device execution environments match and * channels are in a DISABLED state. * @mhi_dev: Device associated with the channels + * @flags: MHI channel flags */ -int mhi_prepare_for_transfer(struct mhi_device *mhi_dev); +int mhi_prepare_for_transfer(struct mhi_device *mhi_dev, + unsigned int flags); + +/* Automatically allocate and queue inbound buffers */ +#define MHI_CH_INBOUND_ALLOC_BUFS BIT(0) /** * mhi_unprepare_from_transfer - Reset UL and DL channels for data transfer. diff --git a/net/qrtr/mhi.c b/net/qrtr/mhi.c index fa611678af05..1dc955ca57d3 100644 --- a/net/qrtr/mhi.c +++ b/net/qrtr/mhi.c @@ -15,6 +15,7 @@ struct qrtr_mhi_dev { struct qrtr_endpoint ep; struct mhi_device *mhi_dev; struct device *dev; + struct completion ready; }; /* From MHI to QRTR */ @@ -50,6 +51,10 @@ static int qcom_mhi_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb) struct qrtr_mhi_dev *qdev = container_of(ep, struct qrtr_mhi_dev, ep); int rc; + rc = wait_for_completion_interruptible(&qdev->ready); + if (rc) + goto free_skb; + if (skb->sk) sock_hold(skb->sk); @@ -79,7 +84,7 @@ static int qcom_mhi_qrtr_probe(struct mhi_device *mhi_dev, int rc; /* start channels */ - rc = mhi_prepare_for_transfer(mhi_dev); + rc = mhi_prepare_for_transfer(mhi_dev, 0); if (rc) return rc; @@ -96,6 +101,15 @@ static int qcom_mhi_qrtr_probe(struct mhi_device *mhi_dev, if (rc) return rc; + /* start channels */ + rc = mhi_prepare_for_transfer(mhi_dev, MHI_CH_INBOUND_ALLOC_BUFS); + if (rc) { + qrtr_endpoint_unregister(&qdev->ep); + dev_set_drvdata(&mhi_dev->dev, NULL); + return rc; + } + + complete_all(&qdev->ready); dev_dbg(qdev->dev, "Qualcomm MHI QRTR driver probed\n"); return 0; -- cgit From 5f7b51bf09baca8e4f80cbe879536842bafb5f31 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Wed, 28 Jul 2021 17:01:15 +0200 Subject: netfilter: ipset: Limit the maximal range of consecutive elements to add/delete The range size of consecutive elements were not limited. Thus one could define a huge range which may result soft lockup errors due to the long execution time. Now the range size is limited to 2^20 entries. Reported-by: Brad Spengler Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 3 +++ net/netfilter/ipset/ip_set_hash_ip.c | 9 ++++++++- net/netfilter/ipset/ip_set_hash_ipmark.c | 10 +++++++++- net/netfilter/ipset/ip_set_hash_ipport.c | 3 +++ net/netfilter/ipset/ip_set_hash_ipportip.c | 3 +++ net/netfilter/ipset/ip_set_hash_ipportnet.c | 3 +++ net/netfilter/ipset/ip_set_hash_net.c | 11 ++++++++++- net/netfilter/ipset/ip_set_hash_netiface.c | 10 +++++++++- net/netfilter/ipset/ip_set_hash_netnet.c | 16 +++++++++++++++- net/netfilter/ipset/ip_set_hash_netport.c | 11 ++++++++++- net/netfilter/ipset/ip_set_hash_netportnet.c | 16 +++++++++++++++- 11 files changed, 88 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 10279c4830ac..ada1296c87d5 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -196,6 +196,9 @@ struct ip_set_region { u32 elements; /* Number of elements vs timeout */ }; +/* Max range where every element is added/deleted in one step */ +#define IPSET_MAX_RANGE (1<<20) + /* The max revision number supported by any set type + 1 */ #define IPSET_REVISION_MAX 9 diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index d1bef23fd4f5..dd30c03d5a23 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -132,8 +132,11 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; - if (ip > ip_to) + if (ip > ip_to) { + if (ip_to == 0) + return -IPSET_ERR_HASH_ELEM; swap(ip, ip_to); + } } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); @@ -144,6 +147,10 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1); + /* 64bit division is not allowed on 32bit */ + if (((u64)ip_to - ip + 1) >> (32 - h->netmask) > IPSET_MAX_RANGE) + return -ERANGE; + if (retried) { ip = ntohl(h->next.ip); e.ip = htonl(ip); diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 18346d18aa16..153de3457423 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -121,6 +121,8 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK])); e.mark &= h->markmask; + if (e.mark == 0 && e.ip == 0) + return -IPSET_ERR_HASH_ELEM; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) { @@ -133,8 +135,11 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; - if (ip > ip_to) + if (ip > ip_to) { + if (e.mark == 0 && ip_to == 0) + return -IPSET_ERR_HASH_ELEM; swap(ip, ip_to); + } } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); @@ -143,6 +148,9 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], ip_set_mask_from_to(ip, ip_to, cidr); } + if (((u64)ip_to - ip + 1) > IPSET_MAX_RANGE) + return -ERANGE; + if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index e1ca11196515..7303138e46be 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -173,6 +173,9 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } + if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) + return -ERANGE; + if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index ab179e064597..334fb1ad0e86 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -180,6 +180,9 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } + if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) + return -ERANGE; + if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 8f075b44cf64..7df94f437f60 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -253,6 +253,9 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } + if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) + return -ERANGE; + ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index c1a11f041ac6..1422739d9aa2 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -140,7 +140,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net4_elem e = { .cidr = HOST_MASK }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0; + u32 ip = 0, ip_to = 0, ipn, n = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -188,6 +188,15 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; } + ipn = ip; + do { + ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr); + n++; + } while (ipn++ < ip_to); + + if (n > IPSET_MAX_RANGE) + return -ERANGE; + if (retried) ip = ntohl(h->next.ip); do { diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index ddd51c2e1cb3..9810f5bf63f5 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -202,7 +202,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0; + u32 ip = 0, ip_to = 0, ipn, n = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -256,6 +256,14 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip, ip_to, e.cidr); } + ipn = ip; + do { + ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr); + n++; + } while (ipn++ < ip_to); + + if (n > IPSET_MAX_RANGE) + return -ERANGE; if (retried) ip = ntohl(h->next.ip); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 6532f0505e66..3d09eefe998a 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -168,7 +168,8 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_netnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0; - u32 ip2 = 0, ip2_from = 0, ip2_to = 0; + u32 ip2 = 0, ip2_from = 0, ip2_to = 0, ipn; + u64 n = 0, m = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -244,6 +245,19 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); } + ipn = ip; + do { + ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr[0]); + n++; + } while (ipn++ < ip_to); + ipn = ip2_from; + do { + ipn = ip_set_range_to_cidr(ipn, ip2_to, &e.cidr[1]); + m++; + } while (ipn++ < ip2_to); + + if (n*m > IPSET_MAX_RANGE) + return -ERANGE; if (retried) { ip = ntohl(h->next.ip[0]); diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index ec1564a1cb5a..09cf72eb37f8 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -158,7 +158,8 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 port, port_to, p = 0, ip = 0, ip_to = 0; + u32 port, port_to, p = 0, ip = 0, ip_to = 0, ipn; + u64 n = 0; bool with_ports = false; u8 cidr; int ret; @@ -235,6 +236,14 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip, ip_to, e.cidr + 1); } + ipn = ip; + do { + ipn = ip_set_range_to_cidr(ipn, ip_to, &cidr); + n++; + } while (ipn++ < ip_to); + + if (n*(port_to - port + 1) > IPSET_MAX_RANGE) + return -ERANGE; if (retried) { ip = ntohl(h->next.ip); diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 0e91d1e82f1c..19bcdb3141f6 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -182,7 +182,8 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_netportnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, p = 0, port, port_to; - u32 ip2_from = 0, ip2_to = 0, ip2; + u32 ip2_from = 0, ip2_to = 0, ip2, ipn; + u64 n = 0, m = 0; bool with_ports = false; int ret; @@ -284,6 +285,19 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); } + ipn = ip; + do { + ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr[0]); + n++; + } while (ipn++ < ip_to); + ipn = ip2_from; + do { + ipn = ip_set_range_to_cidr(ipn, ip2_to, &e.cidr[1]); + m++; + } while (ipn++ < ip2_to); + + if (n*m*(port_to - port + 1) > IPSET_MAX_RANGE) + return -ERANGE; if (retried) { ip = ntohl(h->next.ip[0]); -- cgit From 1027b96ec9d34f9abab69bc1a4dc5b1ad8ab1349 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 6 Aug 2021 16:21:24 +0800 Subject: once: Fix panic when module unload DO_ONCE DEFINE_STATIC_KEY_TRUE(___once_key); __do_once_done once_disable_jump(once_key); INIT_WORK(&w->work, once_deferred); struct once_work *w; w->key = key; schedule_work(&w->work); module unload //*the key is destroy* process_one_work once_deferred BUG_ON(!static_key_enabled(work->key)); static_key_count((struct static_key *)x) //*access key, crash* When module uses DO_ONCE mechanism, it could crash due to the above concurrency problem, we could reproduce it with link[1]. Fix it by add/put module refcount in the once work process. [1] https://lore.kernel.org/netdev/eaa6c371-465e-57eb-6be9-f4b16b9d7cbf@huawei.com/ Cc: Hannes Frederic Sowa Cc: Daniel Borkmann Cc: David S. Miller Cc: Eric Dumazet Reported-by: Minmin chen Signed-off-by: Kefeng Wang Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- include/linux/once.h | 4 ++-- lib/once.c | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/once.h b/include/linux/once.h index 9225ee6d96c7..ae6f4eb41cbe 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -7,7 +7,7 @@ bool __do_once_start(bool *done, unsigned long *flags); void __do_once_done(bool *done, struct static_key_true *once_key, - unsigned long *flags); + unsigned long *flags, struct module *mod); /* Call a function exactly once. The idea of DO_ONCE() is to perform * a function call such as initialization of random seeds, etc, only @@ -46,7 +46,7 @@ void __do_once_done(bool *done, struct static_key_true *once_key, if (unlikely(___ret)) { \ func(__VA_ARGS__); \ __do_once_done(&___done, &___once_key, \ - &___flags); \ + &___flags, THIS_MODULE); \ } \ } \ ___ret; \ diff --git a/lib/once.c b/lib/once.c index 8b7d6235217e..59149bf3bfb4 100644 --- a/lib/once.c +++ b/lib/once.c @@ -3,10 +3,12 @@ #include #include #include +#include struct once_work { struct work_struct work; struct static_key_true *key; + struct module *module; }; static void once_deferred(struct work_struct *w) @@ -16,10 +18,11 @@ static void once_deferred(struct work_struct *w) work = container_of(w, struct once_work, work); BUG_ON(!static_key_enabled(work->key)); static_branch_disable(work->key); + module_put(work->module); kfree(work); } -static void once_disable_jump(struct static_key_true *key) +static void once_disable_jump(struct static_key_true *key, struct module *mod) { struct once_work *w; @@ -29,6 +32,8 @@ static void once_disable_jump(struct static_key_true *key) INIT_WORK(&w->work, once_deferred); w->key = key; + w->module = mod; + __module_get(mod); schedule_work(&w->work); } @@ -53,11 +58,11 @@ bool __do_once_start(bool *done, unsigned long *flags) EXPORT_SYMBOL(__do_once_start); void __do_once_done(bool *done, struct static_key_true *once_key, - unsigned long *flags) + unsigned long *flags, struct module *mod) __releases(once_lock) { *done = true; spin_unlock_irqrestore(&once_lock, *flags); - once_disable_jump(once_key); + once_disable_jump(once_key, mod); } EXPORT_SYMBOL(__do_once_done); -- cgit From 71330842ff93ae67a066c1fa68d75672527312fa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 9 Aug 2021 21:45:32 +0200 Subject: bpf: Add _kernel suffix to internal lockdown_bpf_read Rename LOCKDOWN_BPF_READ into LOCKDOWN_BPF_READ_KERNEL so we have naming more consistent with a LOCKDOWN_BPF_WRITE_USER option that we are adding. Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko --- include/linux/security.h | 2 +- kernel/bpf/helpers.c | 4 ++-- kernel/trace/bpf_trace.c | 8 ++++---- security/security.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 24eda04221e9..724d7a4a0c91 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -123,7 +123,7 @@ enum lockdown_reason { LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, LOCKDOWN_KPROBES, - LOCKDOWN_BPF_READ, + LOCKDOWN_BPF_READ_KERNEL, LOCKDOWN_PERF, LOCKDOWN_TRACEFS, LOCKDOWN_XMON_RW, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 62cf00383910..0b04553e8c44 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1070,12 +1070,12 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: - return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_proto; case BPF_FUNC_probe_read_user_str: return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: - return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_str_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b4916ef388ad..1836591197a5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -999,19 +999,19 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: - return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_proto; case BPF_FUNC_probe_read_user_str: return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: - return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_str_proto; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE case BPF_FUNC_probe_read: - return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_compat_proto; case BPF_FUNC_probe_read_str: - return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_compat_str_proto; #endif #ifdef CONFIG_CGROUPS diff --git a/security/security.c b/security/security.c index 09533cbb7221..6b83ab4e9d66 100644 --- a/security/security.c +++ b/security/security.c @@ -61,7 +61,7 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", - [LOCKDOWN_BPF_READ] = "use of bpf to read kernel RAM", + [LOCKDOWN_BPF_READ_KERNEL] = "use of bpf to read kernel RAM", [LOCKDOWN_PERF] = "unsafe use of perf", [LOCKDOWN_TRACEFS] = "use of tracefs", [LOCKDOWN_XMON_RW] = "xmon read and write access", -- cgit From 563476ae0c5e48a028cbfa38fa9d2fc0418eb88f Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Sun, 11 Apr 2021 15:32:55 +0300 Subject: net/mlx5: Synchronize correct IRQ when destroying CQ The CQ destroy is performed based on the IRQ number that is stored in cq->irqn. That number wasn't set explicitly during CQ creation and as expected some of the API users of mlx5_core_create_cq() forgot to update it. This caused to wrong synchronization call of the wrong IRQ with a number 0 instead of the real one. As a fix, set the IRQ number directly in the mlx5_core_create_cq() and update all users accordingly. Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Fixes: ef1659ade359 ("IB/mlx5: Add DEVX support for CQ events") Signed-off-by: Shay Drory Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/cq.c | 4 +--- drivers/infiniband/hw/mlx5/devx.c | 3 +-- drivers/net/ethernet/mellanox/mlx5/core/cq.c | 1 + drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 13 ++----------- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 20 ++++++++++++++++---- drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c | 4 +--- drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h | 2 ++ .../ethernet/mellanox/mlx5/core/steering/dr_send.c | 4 +--- drivers/vdpa/mlx5/net/mlx5_vnet.c | 3 +-- include/linux/mlx5/driver.h | 3 +-- 10 files changed, 27 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 7abeb576b3c5..b8e5e371bb19 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -945,7 +945,6 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, u32 *cqb = NULL; void *cqc; int cqe_size; - unsigned int irqn; int eqn; int err; @@ -984,7 +983,7 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, INIT_WORK(&cq->notify_work, notify_soft_wc_handler); } - err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn); + err = mlx5_vector2eqn(dev->mdev, vector, &eqn); if (err) goto err_cqb; @@ -1007,7 +1006,6 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, goto err_cqb; mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn); - cq->mcq.irqn = irqn; if (udata) cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp; else diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index eb9b0a2707f8..c869b2a91a28 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -975,7 +975,6 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_EQN)( struct mlx5_ib_dev *dev; int user_vector; int dev_eqn; - unsigned int irqn; int err; if (uverbs_copy_from(&user_vector, attrs, @@ -987,7 +986,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_EQN)( return PTR_ERR(c); dev = to_mdev(c->ibucontext.device); - err = mlx5_vector2eqn(dev->mdev, user_vector, &dev_eqn, &irqn); + err = mlx5_vector2eqn(dev->mdev, user_vector, &dev_eqn); if (err < 0) return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index df3e4938ecdd..360e093874d4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -134,6 +134,7 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, cq->cqn); cq->uar = dev->priv.uar; + cq->irqn = eq->core.irqn; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index fd250f7bcd88..24f919ef9b8e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1535,15 +1535,9 @@ static int mlx5e_alloc_cq_common(struct mlx5e_priv *priv, { struct mlx5_core_dev *mdev = priv->mdev; struct mlx5_core_cq *mcq = &cq->mcq; - int eqn_not_used; - unsigned int irqn; int err; u32 i; - err = mlx5_vector2eqn(mdev, param->eq_ix, &eqn_not_used, &irqn); - if (err) - return err; - err = mlx5_cqwq_create(mdev, ¶m->wq, param->cqc, &cq->wq, &cq->wq_ctrl); if (err) @@ -1557,7 +1551,6 @@ static int mlx5e_alloc_cq_common(struct mlx5e_priv *priv, mcq->vector = param->eq_ix; mcq->comp = mlx5e_completion_event; mcq->event = mlx5e_cq_error_event; - mcq->irqn = irqn; for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) { struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i); @@ -1605,11 +1598,10 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param) void *in; void *cqc; int inlen; - unsigned int irqn_not_used; int eqn; int err; - err = mlx5_vector2eqn(mdev, param->eq_ix, &eqn, &irqn_not_used); + err = mlx5_vector2eqn(mdev, param->eq_ix, &eqn); if (err) return err; @@ -1983,9 +1975,8 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, struct mlx5e_channel *c; unsigned int irq; int err; - int eqn; - err = mlx5_vector2eqn(priv->mdev, ix, &eqn, &irq); + err = mlx5_vector2irqn(priv->mdev, ix, &irq); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 6e074cc457de..605c8ecc3610 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -855,8 +855,8 @@ clean: return err; } -int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, - unsigned int *irqn) +static int vector2eqnirqn(struct mlx5_core_dev *dev, int vector, int *eqn, + unsigned int *irqn) { struct mlx5_eq_table *table = dev->priv.eq_table; struct mlx5_eq_comp *eq, *n; @@ -865,8 +865,10 @@ int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) { if (i++ == vector) { - *eqn = eq->core.eqn; - *irqn = eq->core.irqn; + if (irqn) + *irqn = eq->core.irqn; + if (eqn) + *eqn = eq->core.eqn; err = 0; break; } @@ -874,8 +876,18 @@ int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, return err; } + +int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn) +{ + return vector2eqnirqn(dev, vector, eqn, NULL); +} EXPORT_SYMBOL(mlx5_vector2eqn); +int mlx5_vector2irqn(struct mlx5_core_dev *dev, int vector, unsigned int *irqn) +{ + return vector2eqnirqn(dev, vector, NULL, irqn); +} + unsigned int mlx5_comp_vectors_count(struct mlx5_core_dev *dev) { return dev->priv.eq_table->num_comp_eqs; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c index bd66ab2af5b5..d5da4ab65766 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c @@ -417,7 +417,6 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) struct mlx5_wq_param wqp; struct mlx5_cqe64 *cqe; int inlen, err, eqn; - unsigned int irqn; void *cqc, *in; __be64 *pas; u32 i; @@ -446,7 +445,7 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) goto err_cqwq; } - err = mlx5_vector2eqn(mdev, smp_processor_id(), &eqn, &irqn); + err = mlx5_vector2eqn(mdev, smp_processor_id(), &eqn); if (err) { kvfree(in); goto err_cqwq; @@ -476,7 +475,6 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) *conn->cq.mcq.arm_db = 0; conn->cq.mcq.vector = 0; conn->cq.mcq.comp = mlx5_fpga_conn_cq_complete; - conn->cq.mcq.irqn = irqn; conn->cq.mcq.uar = fdev->conn_res.uar; tasklet_setup(&conn->cq.tasklet, mlx5_fpga_conn_cq_tasklet); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h index 624cedebb510..d3d628b862f3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h @@ -104,4 +104,6 @@ void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev); struct cpu_rmap *mlx5_eq_table_get_rmap(struct mlx5_core_dev *dev); #endif +int mlx5_vector2irqn(struct mlx5_core_dev *dev, int vector, unsigned int *irqn); + #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index 12cf323a5943..9df0e73d1c35 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -749,7 +749,6 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, struct mlx5_cqe64 *cqe; struct mlx5dr_cq *cq; int inlen, err, eqn; - unsigned int irqn; void *cqc, *in; __be64 *pas; int vector; @@ -782,7 +781,7 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, goto err_cqwq; vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev); - err = mlx5_vector2eqn(mdev, vector, &eqn, &irqn); + err = mlx5_vector2eqn(mdev, vector, &eqn); if (err) { kvfree(in); goto err_cqwq; @@ -818,7 +817,6 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, *cq->mcq.arm_db = cpu_to_be32(2 << 28); cq->mcq.vector = 0; - cq->mcq.irqn = irqn; cq->mcq.uar = uar; return cq; diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 2a31467f7ac5..379a19144a25 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -526,7 +526,6 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent) void __iomem *uar_page = ndev->mvdev.res.uar->map; u32 out[MLX5_ST_SZ_DW(create_cq_out)]; struct mlx5_vdpa_cq *vcq = &mvq->cq; - unsigned int irqn; __be64 *pas; int inlen; void *cqc; @@ -566,7 +565,7 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent) /* Use vector 0 by default. Consider adding code to choose least used * vector. */ - err = mlx5_vector2eqn(mdev, 0, &eqn, &irqn); + err = mlx5_vector2eqn(mdev, 0, &eqn); if (err) goto err_vec; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1efe37466969..25a8be58d289 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1044,8 +1044,7 @@ void mlx5_unregister_debugfs(void); void mlx5_fill_page_array(struct mlx5_frag_buf *buf, __be64 *pas); void mlx5_fill_page_frag_array_perm(struct mlx5_frag_buf *buf, __be64 *pas, u8 perm); void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas); -int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, - unsigned int *irqn); +int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn); int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); -- cgit From 51e1bb9eeaf7868db56e58f47848e364ab4c4129 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 9 Aug 2021 12:43:17 +0200 Subject: bpf: Add lockdown check for probe_write_user helper Back then, commit 96ae52279594 ("bpf: Add bpf_probe_write_user BPF helper to be called in tracers") added the bpf_probe_write_user() helper in order to allow to override user space memory. Its original goal was to have a facility to "debug, divert, and manipulate execution of semi-cooperative processes" under CAP_SYS_ADMIN. Write to kernel was explicitly disallowed since it would otherwise tamper with its integrity. One use case was shown in cf9b1199de27 ("samples/bpf: Add test/example of using bpf_probe_write_user bpf helper") where the program DNATs traffic at the time of connect(2) syscall, meaning, it rewrites the arguments to a syscall while they're still in userspace, and before the syscall has a chance to copy the argument into kernel space. These days we have better mechanisms in BPF for achieving the same (e.g. for load-balancers), but without having to write to userspace memory. Of course the bpf_probe_write_user() helper can also be used to abuse many other things for both good or bad purpose. Outside of BPF, there is a similar mechanism for ptrace(2) such as PTRACE_PEEK{TEXT,DATA} and PTRACE_POKE{TEXT,DATA}, but would likely require some more effort. Commit 96ae52279594 explicitly dedicated the helper for experimentation purpose only. Thus, move the helper's availability behind a newly added LOCKDOWN_BPF_WRITE_USER lockdown knob so that the helper is disabled under the "integrity" mode. More fine-grained control can be implemented also from LSM side with this change. Fixes: 96ae52279594 ("bpf: Add bpf_probe_write_user BPF helper to be called in tracers") Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko --- include/linux/security.h | 1 + kernel/trace/bpf_trace.c | 5 +++-- security/security.c | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 724d7a4a0c91..5b7288521300 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -120,6 +120,7 @@ enum lockdown_reason { LOCKDOWN_MMIOTRACE, LOCKDOWN_DEBUGFS, LOCKDOWN_XMON_WR, + LOCKDOWN_BPF_WRITE_USER, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, LOCKDOWN_KPROBES, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1836591197a5..fdd14072fc3b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -990,12 +990,13 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_numa_node_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; - case BPF_FUNC_probe_write_user: - return bpf_get_probe_write_proto(); case BPF_FUNC_current_task_under_cgroup: return &bpf_current_task_under_cgroup_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; + case BPF_FUNC_probe_write_user: + return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ? + NULL : bpf_get_probe_write_proto(); case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: diff --git a/security/security.c b/security/security.c index 6b83ab4e9d66..9ffa9e9c5c55 100644 --- a/security/security.c +++ b/security/security.c @@ -58,6 +58,7 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_MMIOTRACE] = "unsafe mmio", [LOCKDOWN_DEBUGFS] = "debugfs access", [LOCKDOWN_XMON_WR] = "xmon write access", + [LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", -- cgit From a2baf4e8bb0f306fbed7b5e6197c02896a638ab5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 9 Aug 2021 18:04:13 -0700 Subject: bpf: Fix potentially incorrect results with bpf_get_local_storage() Commit b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") fixed a bug for bpf_get_local_storage() helper so different tasks won't mess up with each other's percpu local storage. The percpu data contains 8 slots so it can hold up to 8 contexts (same or different tasks), for 8 different program runs, at the same time. This in general is sufficient. But our internal testing showed the following warning multiple times: [...] warning: WARNING: CPU: 13 PID: 41661 at include/linux/bpf-cgroup.h:193 __cgroup_bpf_run_filter_sock_ops+0x13e/0x180 RIP: 0010:__cgroup_bpf_run_filter_sock_ops+0x13e/0x180 tcp_call_bpf.constprop.99+0x93/0xc0 tcp_conn_request+0x41e/0xa50 ? tcp_rcv_state_process+0x203/0xe00 tcp_rcv_state_process+0x203/0xe00 ? sk_filter_trim_cap+0xbc/0x210 ? tcp_v6_inbound_md5_hash.constprop.41+0x44/0x160 tcp_v6_do_rcv+0x181/0x3e0 tcp_v6_rcv+0xc65/0xcb0 ip6_protocol_deliver_rcu+0xbd/0x450 ip6_input_finish+0x11/0x20 ip6_input+0xb5/0xc0 ip6_sublist_rcv_finish+0x37/0x50 ip6_sublist_rcv+0x1dc/0x270 ipv6_list_rcv+0x113/0x140 __netif_receive_skb_list_core+0x1a0/0x210 netif_receive_skb_list_internal+0x186/0x2a0 gro_normal_list.part.170+0x19/0x40 napi_complete_done+0x65/0x150 mlx5e_napi_poll+0x1ae/0x680 __napi_poll+0x25/0x120 net_rx_action+0x11e/0x280 __do_softirq+0xbb/0x271 irq_exit_rcu+0x97/0xa0 common_interrupt+0x7f/0xa0 asm_common_interrupt+0x1e/0x40 RIP: 0010:bpf_prog_1835a9241238291a_tw_egress+0x5/0xbac ? __cgroup_bpf_run_filter_skb+0x378/0x4e0 ? do_softirq+0x34/0x70 ? ip6_finish_output2+0x266/0x590 ? ip6_finish_output+0x66/0xa0 ? ip6_output+0x6c/0x130 ? ip6_xmit+0x279/0x550 ? ip6_dst_check+0x61/0xd0 [...] Using drgn [0] to dump the percpu buffer contents showed that on this CPU slot 0 is still available, but slots 1-7 are occupied and those tasks in slots 1-7 mostly don't exist any more. So we might have issues in bpf_cgroup_storage_unset(). Further debugging confirmed that there is a bug in bpf_cgroup_storage_unset(). Currently, it tries to unset "current" slot with searching from the start. So the following sequence is possible: 1. A task is running and claims slot 0 2. Running BPF program is done, and it checked slot 0 has the "task" and ready to reset it to NULL (not yet). 3. An interrupt happens, another BPF program runs and it claims slot 1 with the *same* task. 4. The unset() in interrupt context releases slot 0 since it matches "task". 5. Interrupt is done, the task in process context reset slot 0. At the end, slot 1 is not reset and the same process can continue to occupy slots 2-7 and finally, when the above step 1-5 is repeated again, step 3 BPF program won't be able to claim an empty slot and a warning will be issued. To fix the issue, for unset() function, we should traverse from the last slot to the first. This way, the above issue can be avoided. The same reverse traversal should also be done in bpf_get_local_storage() helper itself. Otherwise, incorrect local storage may be returned to BPF program. [0] https://github.com/osandov/drgn Fixes: b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210810010413.1976277-1-yhs@fb.com --- include/linux/bpf-cgroup.h | 4 ++-- kernel/bpf/helpers.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 8b77d08d4b47..6c9b10d82c80 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -201,8 +201,8 @@ static inline void bpf_cgroup_storage_unset(void) { int i; - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) + for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) { + if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) continue; this_cpu_write(bpf_cgroup_storage_info[i].task, NULL); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0b04553e8c44..7a97b2f4747d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -397,8 +397,8 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) void *ptr; int i; - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) + for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) { + if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) continue; storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]); -- cgit From 77e89afc25f30abd56e76a809ee2884d7c1b63ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:47 +0200 Subject: PCI/MSI: Protect msi_desc::masked for multi-MSI Multi-MSI uses a single MSI descriptor and there is a single mask register when the device supports per vector masking. To avoid reading back the mask register the value is cached in the MSI descriptor and updates are done by clearing and setting bits in the cache and writing it to the device. But nothing protects msi_desc::masked and the mask register from being modified concurrently on two different CPUs for two different Linux interrupts which belong to the same multi-MSI descriptor. Add a lock to struct device and protect any operation on the mask and the mask register with it. This makes the update of msi_desc::masked unconditional, but there is no place which requires a modification of the hardware register without updating the masked cache. msi_mask_irq() is now an empty wrapper which will be cleaned up in follow up changes. The problem goes way back to the initial support of multi-MSI, but picking the commit which introduced the mask cache is a valid cut off point (2.6.30). Fixes: f2440d9acbe8 ("PCI MSI: Refactor interrupt masking code") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.726833414@linutronix.de --- drivers/base/core.c | 1 + drivers/pci/msi.c | 19 ++++++++++--------- include/linux/device.h | 1 + include/linux/msi.h | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/core.c b/drivers/base/core.c index f6360490a4a3..6c0ef9d55a34 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2837,6 +2837,7 @@ void device_initialize(struct device *dev) device_pm_init(dev); set_dev_node(dev, -1); #ifdef CONFIG_GENERIC_MSI_IRQ + raw_spin_lock_init(&dev->msi_lock); INIT_LIST_HEAD(&dev->msi_list); #endif INIT_LIST_HEAD(&dev->links.consumers); diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index f0f7026b7ac0..e5e75331b415 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -143,24 +143,25 @@ static inline __attribute_const__ u32 msi_mask(unsigned x) * reliably as devices without an INTx disable bit will then generate a * level IRQ which will never be cleared. */ -u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) +void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) { - u32 mask_bits = desc->masked; + raw_spinlock_t *lock = &desc->dev->msi_lock; + unsigned long flags; if (pci_msi_ignore_mask || !desc->msi_attrib.maskbit) - return 0; + return; - mask_bits &= ~mask; - mask_bits |= flag; + raw_spin_lock_irqsave(lock, flags); + desc->masked &= ~mask; + desc->masked |= flag; pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->mask_pos, - mask_bits); - - return mask_bits; + desc->masked); + raw_spin_unlock_irqrestore(lock, flags); } static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) { - desc->masked = __pci_msi_desc_mask_irq(desc, mask, flag); + __pci_msi_desc_mask_irq(desc, mask, flag); } static void __iomem *pci_msix_desc_addr(struct msi_desc *desc) diff --git a/include/linux/device.h b/include/linux/device.h index 59940f1744c1..e53aa5065f58 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -506,6 +506,7 @@ struct device { struct dev_pin_info *pins; #endif #ifdef CONFIG_GENERIC_MSI_IRQ + raw_spinlock_t msi_lock; struct list_head msi_list; #endif #ifdef CONFIG_DMA_OPS diff --git a/include/linux/msi.h b/include/linux/msi.h index 6aff469e511d..e8bdcb83172b 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -233,7 +233,7 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag); -u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag); +void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag); void pci_msi_mask_irq(struct irq_data *data); void pci_msi_unmask_irq(struct irq_data *data); -- cgit From 826da771291fc25a428e871f9e7fb465e390f852 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 29 Jul 2021 23:51:48 +0200 Subject: genirq: Provide IRQCHIP_AFFINITY_PRE_STARTUP X86 IO/APIC and MSI interrupts (when used without interrupts remapping) require that the affinity setup on startup is done before the interrupt is enabled for the first time as the non-remapped operation mode cannot safely migrate enabled interrupts from arbitrary contexts. Provide a new irq chip flag which allows affected hardware to request this. This has to be opt-in because there have been reports in the past that some interrupt chips cannot handle affinity setting before startup. Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)") Signed-off-by: Thomas Gleixner Tested-by: Marc Zyngier Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.779791738@linutronix.de --- include/linux/irq.h | 2 ++ kernel/irq/chip.c | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 8e9a9ae471a6..c8293c817646 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -569,6 +569,7 @@ struct irq_chip { * IRQCHIP_SUPPORTS_NMI: Chip can deliver NMIs, only for root irqchips * IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND: Invokes __enable_irq()/__disable_irq() for wake irqs * in the suspend path if they are in disabled state + * IRQCHIP_AFFINITY_PRE_STARTUP: Default affinity update before startup */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), @@ -581,6 +582,7 @@ enum { IRQCHIP_SUPPORTS_LEVEL_MSI = (1 << 7), IRQCHIP_SUPPORTS_NMI = (1 << 8), IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND = (1 << 9), + IRQCHIP_AFFINITY_PRE_STARTUP = (1 << 10), }; #include diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 7f04c7d8296e..a98bcfc4be7b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -265,8 +265,11 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) } else { switch (__irq_startup_managed(desc, aff, force)) { case IRQ_STARTUP_NORMAL: + if (d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP) + irq_setup_affinity(desc); ret = __irq_startup(desc); - irq_setup_affinity(desc); + if (!(d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP)) + irq_setup_affinity(desc); break; case IRQ_STARTUP_MANAGED: irq_do_set_affinity(d, aff, false); -- cgit From 0e566c8f0f2e8325e35f6f97e13cde5356b41814 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 21 Jul 2021 17:26:47 +0300 Subject: virtio: Protect vqs list access VQs may be accessed to mark the device broken while they are created/destroyed. Hence protect the access to the vqs list. Fixes: e2dcdfe95c0b ("virtio: virtio_break_device() to mark all virtqueues broken.") Signed-off-by: Parav Pandit Link: https://lore.kernel.org/r/20210721142648.1525924-4-parav@nvidia.com Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio.c | 1 + drivers/virtio/virtio_ring.c | 8 ++++++++ include/linux/virtio.h | 1 + 3 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 4b15c00c0a0a..49984d2cba24 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -355,6 +355,7 @@ int register_virtio_device(struct virtio_device *dev) virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); INIT_LIST_HEAD(&dev->vqs); + spin_lock_init(&dev->vqs_list_lock); /* * device_add() causes the bus infrastructure to look for a matching diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index d5934c2e5a89..c2aaa0eff6df 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -1755,7 +1755,9 @@ static struct virtqueue *vring_create_virtqueue_packed( cpu_to_le16(vq->packed.event_flags_shadow); } + spin_lock(&vdev->vqs_list_lock); list_add_tail(&vq->vq.list, &vdev->vqs); + spin_unlock(&vdev->vqs_list_lock); return &vq->vq; err_desc_extra: @@ -2229,7 +2231,9 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, memset(vq->split.desc_state, 0, vring.num * sizeof(struct vring_desc_state_split)); + spin_lock(&vdev->vqs_list_lock); list_add_tail(&vq->vq.list, &vdev->vqs); + spin_unlock(&vdev->vqs_list_lock); return &vq->vq; err_extra: @@ -2291,7 +2295,9 @@ void vring_del_virtqueue(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); + spin_lock(&vq->vq.vdev->vqs_list_lock); list_del(&_vq->list); + spin_unlock(&vq->vq.vdev->vqs_list_lock); if (vq->we_own_ring) { if (vq->packed_ring) { @@ -2386,12 +2392,14 @@ void virtio_break_device(struct virtio_device *dev) { struct virtqueue *_vq; + spin_lock(&dev->vqs_list_lock); list_for_each_entry(_vq, &dev->vqs, list) { struct vring_virtqueue *vq = to_vvq(_vq); /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ WRITE_ONCE(vq->broken, true); } + spin_unlock(&dev->vqs_list_lock); } EXPORT_SYMBOL_GPL(virtio_break_device); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index b1894e0323fa..41edbc01ffa4 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -110,6 +110,7 @@ struct virtio_device { bool config_enabled; bool config_change_pending; spinlock_t config_lock; + spinlock_t vqs_list_lock; /* Protects VQs list access */ struct device dev; struct virtio_device_id id; const struct virtio_config_ops *config; -- cgit From c8d182bd387a09a8b95303c8086238e8bf61fcfc Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Thu, 15 Jul 2021 16:00:26 +0800 Subject: vdpa: Add documentation for vdpa_alloc_device() macro The return value of vdpa_alloc_device() macro is not very clear, so that most of callers did the wrong check. Let's add some comments to better document it. Signed-off-by: Xie Yongji Link: https://lore.kernel.org/r/20210715080026.242-4-xieyongji@bytedance.com Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Reviewed-by: Stefano Garzarella --- include/linux/vdpa.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 3357ac98878d..8cfe49d201dd 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -277,6 +277,17 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, size_t size, const char *name); +/** + * vdpa_alloc_device - allocate and initilaize a vDPA device + * + * @dev_struct: the type of the parent structure + * @member: the name of struct vdpa_device within the @dev_struct + * @parent: the parent device + * @config: the bus operations that is supported by this device + * @name: name of the vdpa device + * + * Return allocated data structure or ERR_PTR upon error + */ #define vdpa_alloc_device(dev_struct, member, parent, config, name) \ container_of(__vdpa_alloc_device( \ parent, config, \ -- cgit From ea2f6af16532511eb1cd8eb62845c37861f24ce8 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 10 Aug 2021 12:25:05 -0400 Subject: vringh: pull in spinlock header we use a spinlock now pull in the correct header to make vring.h self sufficient. Signed-off-by: Michael S. Tsirkin --- include/linux/vringh.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/vringh.h b/include/linux/vringh.h index 84db7b8f912f..212892cf9822 100644 --- a/include/linux/vringh.h +++ b/include/linux/vringh.h @@ -14,6 +14,7 @@ #include #include #include +#include #if IS_REACHABLE(CONFIG_VHOST_IOTLB) #include #include -- cgit From 879753c816dbbdb2a9a395aa4448d29feee92d1a Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 11 Aug 2021 08:37:59 +0300 Subject: vdpa/mlx5: Fix queue type selection logic get_queue_type() comments that splict virtqueue is preferred, however, the actual logic preferred packed virtqueues. Since firmware has not supported packed virtqueues we ended up using split virtqueues as was desired. Since we do not advertise support for packed virtqueues, we add a check to verify split virtqueues are indeed supported. Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Signed-off-by: Eli Cohen Link: https://lore.kernel.org/r/20210811053759.66752-1-elic@nvidia.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 14 ++++++++++---- include/linux/mlx5/mlx5_ifc_vdpa.h | 10 ++++++---- 2 files changed, 16 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 2a31467f7ac5..b1230fa2f5d1 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -753,12 +753,12 @@ static int get_queue_type(struct mlx5_vdpa_net *ndev) type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type); /* prefer split queue */ - if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED) - return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED; + if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT) + return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT; - WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)); + WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED)); - return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT; + return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED; } static bool vq_is_tx(u16 idx) @@ -2030,6 +2030,12 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name) return -ENOSPC; mdev = mgtdev->madev->mdev; + if (!(MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_queue_type) & + MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) { + dev_warn(mdev->device, "missing support for split virtqueues\n"); + return -EOPNOTSUPP; + } + /* we save one virtqueue for control virtqueue should we require it */ max_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues); max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS); diff --git a/include/linux/mlx5/mlx5_ifc_vdpa.h b/include/linux/mlx5/mlx5_ifc_vdpa.h index 98b56b75c625..1a9c9d94cb59 100644 --- a/include/linux/mlx5/mlx5_ifc_vdpa.h +++ b/include/linux/mlx5/mlx5_ifc_vdpa.h @@ -11,13 +11,15 @@ enum { }; enum { - MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT = 0x1, // do I check this caps? - MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED = 0x2, + MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT = 0, + MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED = 1, }; enum { - MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT = 0, - MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED = 1, + MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT = + BIT(MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT), + MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED = + BIT(MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED), }; struct mlx5_ifc_virtio_q_bits { -- cgit From b69dd5b3780a7298bd893816a09da751bc0636f7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Aug 2021 12:57:15 -0700 Subject: net: igmp: increase size of mr_ifc_count Some arches support cmpxchg() on 4-byte and 8-byte only. Increase mr_ifc_count width to 32bit to fix this problem. Fixes: 4a2b285e7e10 ("net: igmp: fix data-race in igmp_ifc_timer_expire()") Signed-off-by: Eric Dumazet Reported-by: Guenter Roeck Link: https://lore.kernel.org/r/20210811195715.3684218-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/inetdevice.h | 2 +- net/ipv4/igmp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 53aa0343bf69..aaf4f1b4c277 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -41,7 +41,7 @@ struct in_device { unsigned long mr_qri; /* Query Response Interval */ unsigned char mr_qrv; /* Query Robustness Variable */ unsigned char mr_gq_running; - unsigned char mr_ifc_count; + u32 mr_ifc_count; struct timer_list mr_gq_timer; /* general query timer */ struct timer_list mr_ifc_timer; /* interface change timer */ diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index a51360087b19..00576bae183d 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -803,7 +803,7 @@ static void igmp_gq_timer_expire(struct timer_list *t) static void igmp_ifc_timer_expire(struct timer_list *t) { struct in_device *in_dev = from_timer(in_dev, t, mr_ifc_timer); - u8 mr_ifc_count; + u32 mr_ifc_count; igmpv3_send_cr(in_dev); restart: -- cgit From 7a3dc4f35bf8e1a07e5c3f8ecc8ac923f48493fe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Aug 2021 12:36:14 +0200 Subject: driver core: Add missing kernel doc for device::msi_lock Fixes: 77e89afc25f3 ("PCI/MSI: Protect msi_desc::masked for multi-MSI") Reported-by: Stephen Rothwell Signed-off-by: Thomas Gleixner --- include/linux/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/device.h b/include/linux/device.h index e53aa5065f58..65d84b67b024 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -407,6 +407,7 @@ struct dev_links_info { * @em_pd: device's energy model performance domain * @pins: For device pin management. * See Documentation/driver-api/pin-control.rst for details. + * @msi_lock: Lock to protect MSI mask cache and mask register * @msi_list: Hosts MSI descriptors * @msi_domain: The generic MSI domain this device is using. * @numa_node: NUMA node this device is close to. -- cgit From 3b844826b6c6affa80755254da322b017358a2f4 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 5 Aug 2021 10:04:43 -0700 Subject: pipe: avoid unnecessary EPOLLET wakeups under normal loads I had forgotten just how sensitive hackbench is to extra pipe wakeups, and commit 3a34b13a88ca ("pipe: make pipe writes always wake up readers") ended up causing a quite noticeable regression on larger machines. Now, hackbench isn't necessarily a hugely meaningful benchmark, and it's not clear that this matters in real life all that much, but as Mel points out, it's used often enough when comparing kernels and so the performance regression shows up like a sore thumb. It's easy enough to fix at least for the common cases where pipes are used purely for data transfer, and you never have any exciting poll usage at all. So set a special 'poll_usage' flag when there is polling activity, and make the ugly "EPOLLET has crazy legacy expectations" semantics explicit to only that case. I would love to limit it to just the broken EPOLLET case, but the pipe code can't see the difference between epoll and regular select/poll, so any non-read/write waiting will trigger the extra wakeup behavior. That is sufficient for at least the hackbench case. Apart from making the odd extra wakeup cases more explicitly about EPOLLET, this also makes the extra wakeup be at the _end_ of the pipe write, not at the first write chunk. That is actually much saner semantics (as much as you can call any of the legacy edge-triggered expectations for EPOLLET "sane") since it means that you know the wakeup will happen once the write is done, rather than possibly in the middle of one. [ For stable people: I'm putting a "Fixes" tag on this, but I leave it up to you to decide whether you actually want to backport it or not. It likely has no impact outside of synthetic benchmarks - Linus ] Link: https://lore.kernel.org/lkml/20210802024945.GA8372@xsang-OptiPlex-9020/ Fixes: 3a34b13a88ca ("pipe: make pipe writes always wake up readers") Reported-by: kernel test robot Tested-by: Sandeep Patil Tested-by: Mel Gorman Signed-off-by: Linus Torvalds --- fs/pipe.c | 15 +++++++++------ include/linux/pipe_fs_i.h | 2 ++ 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/pipe.c b/fs/pipe.c index 8e6ef62aeb1c..678dee2a8228 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -444,9 +444,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) #endif /* - * Epoll nonsensically wants a wakeup whether the pipe - * was already empty or not. - * * If it wasn't empty we try to merge new data into * the last buffer. * @@ -455,9 +452,9 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) * spanning multiple pages. */ head = pipe->head; - was_empty = true; + was_empty = pipe_empty(head, pipe->tail); chars = total_len & (PAGE_SIZE-1); - if (chars && !pipe_empty(head, pipe->tail)) { + if (chars && !was_empty) { unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; int offset = buf->offset + buf->len; @@ -590,8 +587,11 @@ out: * This is particularly important for small writes, because of * how (for example) the GNU make jobserver uses small writes to * wake up pending jobs + * + * Epoll nonsensically wants a wakeup whether the pipe + * was already empty or not. */ - if (was_empty) { + if (was_empty || pipe->poll_usage) { wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } @@ -654,6 +654,9 @@ pipe_poll(struct file *filp, poll_table *wait) struct pipe_inode_info *pipe = filp->private_data; unsigned int head, tail; + /* Epoll has some historical nasty semantics, this enables them */ + pipe->poll_usage = 1; + /* * Reading pipe state only -- no need for acquiring the semaphore. * diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 5d2705f1d01c..fc5642431b92 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -48,6 +48,7 @@ struct pipe_buffer { * @files: number of struct file referring this pipe (protected by ->i_lock) * @r_counter: reader counter * @w_counter: writer counter + * @poll_usage: is this pipe used for epoll, which has crazy wakeups? * @fasync_readers: reader side fasync * @fasync_writers: writer side fasync * @bufs: the circular array of pipe buffers @@ -70,6 +71,7 @@ struct pipe_inode_info { unsigned int files; unsigned int r_counter; unsigned int w_counter; + unsigned int poll_usage; struct page *tmp_page; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; -- cgit From f56ce412a59d7d938b81de8878faef128812482c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 19 Aug 2021 19:04:21 -0700 Subject: mm: memcontrol: fix occasional OOMs due to proportional memory.low reclaim We've noticed occasional OOM killing when memory.low settings are in effect for cgroups. This is unexpected and undesirable as memory.low is supposed to express non-OOMing memory priorities between cgroups. The reason for this is proportional memory.low reclaim. When cgroups are below their memory.low threshold, reclaim passes them over in the first round, and then retries if it couldn't find pages anywhere else. But when cgroups are slightly above their memory.low setting, page scan force is scaled down and diminished in proportion to the overage, to the point where it can cause reclaim to fail as well - only in that case we currently don't retry, and instead trigger OOM. To fix this, hook proportional reclaim into the same retry logic we have in place for when cgroups are skipped entirely. This way if reclaim fails and some cgroups were scanned with diminished pressure, we'll try another full-force cycle before giving up and OOMing. [akpm@linux-foundation.org: coding-style fixes] Link: https://lkml.kernel.org/r/20210817180506.220056-1-hannes@cmpxchg.org Fixes: 9783aa9917f8 ("mm, memcg: proportional memory.{low,min} reclaim") Signed-off-by: Johannes Weiner Reported-by: Leon Yang Reviewed-by: Rik van Riel Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Chris Down Acked-by: Michal Hocko Cc: [5.4+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 29 +++++++++++++++-------------- mm/vmscan.c | 27 +++++++++++++++++++-------- 2 files changed, 34 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bfe5c486f4ad..24797929d8a1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -612,12 +612,15 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -static inline unsigned long mem_cgroup_protection(struct mem_cgroup *root, - struct mem_cgroup *memcg, - bool in_low_reclaim) +static inline void mem_cgroup_protection(struct mem_cgroup *root, + struct mem_cgroup *memcg, + unsigned long *min, + unsigned long *low) { + *min = *low = 0; + if (mem_cgroup_disabled()) - return 0; + return; /* * There is no reclaim protection applied to a targeted reclaim. @@ -653,13 +656,10 @@ static inline unsigned long mem_cgroup_protection(struct mem_cgroup *root, * */ if (root == memcg) - return 0; - - if (in_low_reclaim) - return READ_ONCE(memcg->memory.emin); + return; - return max(READ_ONCE(memcg->memory.emin), - READ_ONCE(memcg->memory.elow)); + *min = READ_ONCE(memcg->memory.emin); + *low = READ_ONCE(memcg->memory.elow); } void mem_cgroup_calculate_protection(struct mem_cgroup *root, @@ -1147,11 +1147,12 @@ static inline void memcg_memory_event_mm(struct mm_struct *mm, { } -static inline unsigned long mem_cgroup_protection(struct mem_cgroup *root, - struct mem_cgroup *memcg, - bool in_low_reclaim) +static inline void mem_cgroup_protection(struct mem_cgroup *root, + struct mem_cgroup *memcg, + unsigned long *min, + unsigned long *low) { - return 0; + *min = *low = 0; } static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root, diff --git a/mm/vmscan.c b/mm/vmscan.c index 4620df62f0ff..b0202ab5e136 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -100,9 +100,12 @@ struct scan_control { unsigned int may_swap:1; /* - * Cgroups are not reclaimed below their configured memory.low, - * unless we threaten to OOM. If any cgroups are skipped due to - * memory.low and nothing was reclaimed, go back for memory.low. + * Cgroup memory below memory.low is protected as long as we + * don't threaten to OOM. If any cgroup is reclaimed at + * reduced force or passed over entirely due to its memory.low + * setting (memcg_low_skipped), and nothing is reclaimed as a + * result, then go back for one more cycle that reclaims the protected + * memory (memcg_low_reclaim) to avert OOM. */ unsigned int memcg_low_reclaim:1; unsigned int memcg_low_skipped:1; @@ -2537,15 +2540,14 @@ out: for_each_evictable_lru(lru) { int file = is_file_lru(lru); unsigned long lruvec_size; + unsigned long low, min; unsigned long scan; - unsigned long protection; lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - protection = mem_cgroup_protection(sc->target_mem_cgroup, - memcg, - sc->memcg_low_reclaim); + mem_cgroup_protection(sc->target_mem_cgroup, memcg, + &min, &low); - if (protection) { + if (min || low) { /* * Scale a cgroup's reclaim pressure by proportioning * its current usage to its memory.low or memory.min @@ -2576,6 +2578,15 @@ out: * hard protection. */ unsigned long cgroup_size = mem_cgroup_size(memcg); + unsigned long protection; + + /* memory.low scaling, make sure we retry before OOM */ + if (!sc->memcg_low_reclaim && low > min) { + protection = low; + sc->memcg_low_skipped = 1; + } else { + protection = min; + } /* Avoid TOCTOU with earlier protection check */ cgroup_size = max(cgroup_size, protection); -- cgit From a7cb5d23eaea148f8582229846f8dfff192f05c3 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 19 Aug 2021 19:04:30 -0700 Subject: kfence: fix is_kfence_address() for addresses below KFENCE_POOL_SIZE Originally the addr != NULL check was meant to take care of the case where __kfence_pool == NULL (KFENCE is disabled). However, this does not work for addresses where addr > 0 && addr < KFENCE_POOL_SIZE. This can be the case on NULL-deref where addr > 0 && addr < PAGE_SIZE or any other faulting access with addr < KFENCE_POOL_SIZE. While the kernel would likely crash, the stack traces and report might be confusing due to double faults upon KFENCE's attempt to unprotect such an address. Fix it by just checking that __kfence_pool != NULL instead. Link: https://lkml.kernel.org/r/20210818130300.2482437-1-elver@google.com Fixes: 0ce20dd84089 ("mm: add Kernel Electric-Fence infrastructure") Signed-off-by: Marco Elver Reported-by: Kuan-Ying Lee Acked-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: [5.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kfence.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kfence.h b/include/linux/kfence.h index a70d1ea03532..3fe6dd8a18c1 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -51,10 +51,11 @@ extern atomic_t kfence_allocation_gate; static __always_inline bool is_kfence_address(const void *addr) { /* - * The non-NULL check is required in case the __kfence_pool pointer was - * never initialized; keep it in the slow-path after the range-check. + * The __kfence_pool != NULL check is required to deal with the case + * where __kfence_pool == NULL && addr < KFENCE_POOL_SIZE. Keep it in + * the slow-path after the range-check! */ - return unlikely((unsigned long)((char *)addr - __kfence_pool) < KFENCE_POOL_SIZE && addr); + return unlikely((unsigned long)((char *)addr - __kfence_pool) < KFENCE_POOL_SIZE && __kfence_pool); } /** -- cgit