aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kernel/cpu
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel/cpu')
-rw-r--r--arch/x86/kernel/cpu/.gitignore1
-rw-r--r--arch/x86/kernel/cpu/amd.c33
-rw-r--r--arch/x86/kernel/cpu/common.c6
-rw-r--r--arch/x86/kernel/cpu/feat_ctl.c1
-rw-r--r--arch/x86/kernel/cpu/intel.c224
-rw-r--r--arch/x86/kernel/cpu/match.c13
-rw-r--r--arch/x86/kernel/cpu/mce/core.c16
-rw-r--r--arch/x86/kernel/cpu/mce/dev-mcelog.c47
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c17
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h5
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c14
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c2
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h1
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c16
-rw-r--r--arch/x86/kernel/cpu/umwait.c1
-rw-r--r--arch/x86/kernel/cpu/vmware.c229
16 files changed, 586 insertions, 40 deletions
diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore
index 667df55a4399..0bca7ef7426a 100644
--- a/arch/x86/kernel/cpu/.gitignore
+++ b/arch/x86/kernel/cpu/.gitignore
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
capflags.c
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 1f875fbe1384..547ad7bbf0e0 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -394,6 +394,35 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
}
+/*
+ * Decide whether the PPIN feature advertised in CPUID is really usable.
+ *
+ * MSR_AMD_PPIN_CTL has the final say. In its low two bits, the value 1
+ * means PPIN is locked in disabled mode, and bit 1 is the PPIN_EN bit.
+ * The feature flag is kept only when the MSR is readable and PPIN_EN ends
+ * up set (an attempt is made to set it if it is clear); otherwise the
+ * flag is cleared.
+ */
+static void amd_detect_ppin(struct cpuinfo_x86 *c)
+{
+	const unsigned long long lockout = 1UL, enable = 2UL;
+	unsigned long long ctl;
+	bool usable = false;
+
+	if (!cpu_has(c, X86_FEATURE_AMD_PPIN))
+		return;
+
+	/* Proceed only if the MSR reads and is not locked in disabled mode */
+	if (!rdmsrl_safe(MSR_AMD_PPIN_CTL, &ctl) &&
+	    (ctl & (lockout | enable)) != lockout) {
+		/* Currently disabled but not locked: try to turn it on */
+		if (!(ctl & enable)) {
+			wrmsrl_safe(MSR_AMD_PPIN_CTL, ctl | enable);
+			rdmsrl_safe(MSR_AMD_PPIN_CTL, &ctl);
+		}
+
+		usable = (ctl & enable) != 0;
+	}
+
+	if (!usable)
+		clear_cpu_cap(c, X86_FEATURE_AMD_PPIN);
+}
+
u16 amd_get_nb_id(int cpu)
{
return per_cpu(cpu_llc_id, cpu);
@@ -926,7 +955,8 @@ static void init_amd(struct cpuinfo_x86 *c)
case 0x12: init_amd_ln(c); break;
case 0x15: init_amd_bd(c); break;
case 0x16: init_amd_jg(c); break;
- case 0x17: init_amd_zn(c); break;
+ case 0x17: fallthrough;
+ case 0x19: init_amd_zn(c); break;
}
/*
@@ -941,6 +971,7 @@ static void init_amd(struct cpuinfo_x86 *c)
amd_detect_cmp(c);
amd_get_topology(c);
srat_detect_node(c);
+ amd_detect_ppin(c);
init_amd_cacheinfo(c);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4cdb123ff66a..bed0cb83fe24 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1008,8 +1008,8 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
#define NO_ITLB_MULTIHIT BIT(7)
#define NO_SPECTRE_V2 BIT(8)
-#define VULNWL(_vendor, _family, _model, _whitelist) \
- { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
+#define VULNWL(vendor, family, model, whitelist) \
+ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist)
#define VULNWL_INTEL(model, whitelist) \
VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist)
@@ -1224,6 +1224,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
cpu_set_bug_bits(c);
+ cpu_set_core_cap_bits(c);
+
fpu__init_system(c);
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 0268185bef94..29a3bedabd06 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -5,6 +5,7 @@
#include <asm/msr-index.h>
#include <asm/processor.h>
#include <asm/vmx.h>
+#include "cpu.h"
#undef pr_fmt
#define pr_fmt(fmt) "x86/cpu: " fmt
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index be82cd5841c3..a19a680542ce 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -19,6 +19,9 @@
#include <asm/microcode_intel.h>
#include <asm/hwcap2.h>
#include <asm/elf.h>
+#include <asm/cpu_device_id.h>
+#include <asm/cmdline.h>
+#include <asm/traps.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
@@ -31,6 +34,20 @@
#include <asm/apic.h>
#endif
+enum split_lock_detect_state {
+ sld_off = 0, /* detection disabled */
+ sld_warn, /* warn about user-space split locks */
+ sld_fatal, /* send SIGBUS on user-space split locks */
+};
+
+/*
+ * Default to sld_off because most systems do not support split lock
+ * detection. split_lock_setup() will switch this to sld_warn on systems
+ * that support split lock detect, unless there is a command line override.
+ */
+static enum split_lock_detect_state sld_state __ro_after_init = sld_off;
+static u64 msr_test_ctrl_cache __ro_after_init; /* MSR_TEST_CTRL value read by split_lock_setup(); base value written by sld_update_msr() */
+
/*
* Processors which have self-snooping capability can handle conflicting
* memory type across CPUs by snooping its own cache. However, there exists
@@ -570,6 +587,8 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c)
wrmsrl(MSR_MISC_FEATURES_ENABLES, msr);
}
+static void split_lock_init(void);
+
static void init_intel(struct cpuinfo_x86 *c)
{
early_init_intel(c);
@@ -684,6 +703,8 @@ static void init_intel(struct cpuinfo_x86 *c)
tsx_enable();
if (tsx_ctrl_state == TSX_CTRL_DISABLE)
tsx_disable();
+
+ split_lock_init();
}
#ifdef CONFIG_X86_32
@@ -945,3 +966,206 @@ static const struct cpu_dev intel_cpu_dev = {
};
cpu_dev_register(intel_cpu_dev);
+
+#undef pr_fmt
+#define pr_fmt(fmt) "x86/split lock detection: " fmt
+
+static const struct { /* values accepted for the "split_lock_detect" boot option and the state each one selects */
+ const char *option;
+ enum split_lock_detect_state state;
+} sld_options[] __initconst = {
+ { "off", sld_off },
+ { "warn", sld_warn },
+ { "fatal", sld_fatal },
+};
+
+/*
+ * Return true when the first @arglen bytes of @arg are exactly the string
+ * @opt, i.e. the lengths agree and the bytes compare equal.
+ */
+static inline bool match_option(const char *arg, int arglen, const char *opt)
+{
+	int optlen = strlen(opt);
+
+	if (arglen != optlen)
+		return false;
+
+	return strncmp(arg, opt, optlen) == 0;
+}
+
+/*
+ * Set (@on true) or clear (@on false) the split lock detect bit in
+ * MSR_TEST_CTRL and read the MSR back to confirm the write stuck.
+ *
+ * Returns false if the MSR cannot be read or written, or if the value
+ * read back differs from the value written.
+ */
+static bool split_lock_verify_msr(bool on)
+{
+	u64 wanted, readback;
+
+	if (rdmsrl_safe(MSR_TEST_CTRL, &wanted))
+		return false;
+
+	wanted = on ? (wanted | MSR_TEST_CTRL_SPLIT_LOCK_DETECT)
+		    : (wanted & ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT);
+
+	if (wrmsrl_safe(MSR_TEST_CTRL, wanted))
+		return false;
+
+	rdmsrl(MSR_TEST_CTRL, readback);
+
+	return readback == wanted;
+}
+
+/*
+ * Pick the split lock detection mode at boot and switch detection on.
+ *
+ * The mode defaults to sld_warn and may be overridden by the
+ * "split_lock_detect" command line option (see sld_options[]); an
+ * unrecognised value leaves the default in place. MSR_TEST_CTRL is first
+ * verified with the detect bit cleared, then cached, then verified with
+ * the bit set. Only when all of that succeeds are sld_state and the
+ * X86_FEATURE_SPLIT_LOCK_DETECT capability set.
+ */
+static void __init split_lock_setup(void)
+{
+	enum split_lock_detect_state requested = sld_warn;
+	char value[20];
+	int len, idx;
+
+	if (!split_lock_verify_msr(false)) {
+		pr_info("MSR access failed: Disabled\n");
+		return;
+	}
+
+	len = cmdline_find_option(boot_command_line, "split_lock_detect",
+				  value, sizeof(value));
+	for (idx = 0; len >= 0 && idx < ARRAY_SIZE(sld_options); idx++) {
+		if (!match_option(value, len, sld_options[idx].option))
+			continue;
+
+		requested = sld_options[idx].state;
+		break;
+	}
+
+	if (requested == sld_off) {
+		pr_info("disabled\n");
+		return;
+	}
+
+	if (requested == sld_warn)
+		pr_info("warning about user-space split_locks\n");
+	else if (requested == sld_fatal)
+		pr_info("sending SIGBUS on user-space split_locks\n");
+
+	/* Cache the MSR while the detect bit is still clear */
+	rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
+
+	if (!split_lock_verify_msr(true)) {
+		pr_info("MSR access failed: Disabled\n");
+		return;
+	}
+
+	sld_state = requested;
+	setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT);
+}
+
+/*
+ * Write MSR_TEST_CTRL from the cached value, with the split lock detect
+ * bit added when @on is true.
+ *
+ * MSR_TEST_CTRL is per core, but we treat it like a per CPU MSR. Locking
+ * is not implemented as one thread could undo the setting of the other
+ * thread immediately after dropping the lock anyway.
+ */
+static void sld_update_msr(bool on)
+{
+	u64 ctrl = on ? (msr_test_ctrl_cache | MSR_TEST_CTRL_SPLIT_LOCK_DETECT)
+		      : msr_test_ctrl_cache;
+
+	wrmsrl(MSR_TEST_CTRL, ctrl);
+}
+
+/*
+ * Program the split lock detect bit in MSR_TEST_CTRL according to
+ * sld_state: set unless the state is sld_off. The verification result is
+ * deliberately not acted upon here.
+ */
+static void split_lock_init(void)
+{
+	bool enable = sld_state != sld_off;
+
+	split_lock_verify_msr(enable);
+}
+
+/*
+ * Log a rate-limited warning naming the current task and the address @ip
+ * of the split lock trap, then let the task continue.
+ */
+static void split_lock_warn(unsigned long ip)
+{
+	struct task_struct *tsk = current;
+
+	pr_warn_ratelimited("#AC: %s/%d took a split_lock trap at address: 0x%lx\n",
+			    tsk->comm, tsk->pid, ip);
+
+	/*
+	 * Turn detection off so this task can make progress, and mark the
+	 * task with TIF_SLD so that switch_to_sld() re-enables detection
+	 * when the task is scheduled out.
+	 */
+	sld_update_msr(false);
+	set_tsk_thread_flag(tsk, TIF_SLD);
+}
+
+/*
+ * Handle a split lock trap taken at address @ip on behalf of a guest.
+ *
+ * In sld_warn mode the trap is only warned about and true is returned.
+ * In any other mode a one-time message is logged ("fatal" for sld_fatal,
+ * "bogus" otherwise), SIGBUS/BUS_ADRALN is forced on the current task and
+ * false is returned.
+ */
+bool handle_guest_split_lock(unsigned long ip)
+{
+	const char *kind;
+
+	if (sld_state == sld_warn) {
+		split_lock_warn(ip);
+		return true;
+	}
+
+	kind = (sld_state == sld_fatal) ? "fatal" : "bogus";
+	pr_warn_once("#AC: %s/%d %s split_lock trap at address: 0x%lx\n",
+		     current->comm, current->pid, kind, ip);
+
+	current->thread.error_code = 0;
+	current->thread.trap_nr = X86_TRAP_AC;
+	force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(handle_guest_split_lock);
+
+/*
+ * Handle a user-space split lock trap.
+ *
+ * Returns false, without doing anything, when EFLAGS.AC is set in @regs
+ * or when the mode is sld_fatal. Otherwise the trap is warned about via
+ * split_lock_warn() and true is returned. @error_code is not used.
+ */
+bool handle_user_split_lock(struct pt_regs *regs, long error_code)
+{
+	if (regs->flags & X86_EFLAGS_AC)
+		return false;
+
+	if (sld_state == sld_fatal)
+		return false;
+
+	split_lock_warn(regs->ip);
+
+	return true;
+}
+
+/*
+ * Called only when switching between tasks whose split-lock detection
+ * modes differ. Programs the MSR for the incoming task: detection is on
+ * unless _TIF_SLD is set in @tifn.
+ *
+ * This is right most of the time, but the MSR is shared by the
+ * hyperthreads of a physical core, so there can be glitches when the two
+ * threads need different modes.
+ */
+void switch_to_sld(unsigned long tifn)
+{
+	bool detect = !(tifn & _TIF_SLD);
+
+	sld_update_msr(detect);
+}
+
+/*
+ * Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should
+ * only be trusted if it is confirmed that a CPU model implements a
+ * specific feature at a particular bit position.
+ *
+ * The possible driver data field values:
+ *
+ * - 0: CPU models that are known to have the per-core split-lock detection
+ * feature even though they do not enumerate IA32_CORE_CAPABILITIES.
+ *
+ * - 1: CPU models which may enumerate IA32_CORE_CAPABILITIES and if so use
+ * bit 5 to enumerate the per-core split-lock detection feature.
+ *
+ * cpu_set_core_cap_bits() looks the running CPU up in this table and
+ * ignores entries carrying any other driver data value.
+ */
+static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0),
+ X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, 1),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, 1),
+ X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, 1),
+ {} /* empty entry closing the table */
+};
+
+/*
+ * Run split_lock_setup() if this CPU qualifies for split lock detection.
+ *
+ * Nothing is done when running under a hypervisor or when the CPU is not
+ * listed in split_lock_cpu_ids[]. Entries with driver data 0 qualify
+ * unconditionally; entries with driver data 1 additionally require
+ * X86_FEATURE_CORE_CAPABILITIES on @c and the split lock detect bit in
+ * MSR_IA32_CORE_CAPS. Any other driver data value is ignored.
+ */
+void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c)
+{
+	const struct x86_cpu_id *match;
+	u64 core_caps;
+
+	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
+		return;
+
+	match = x86_match_cpu(split_lock_cpu_ids);
+	if (!match)
+		return;
+
+	if (match->driver_data == 1) {
+		if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES))
+			return;
+
+		rdmsrl(MSR_IA32_CORE_CAPS, core_caps);
+		if (!(core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT))
+			return;
+	} else if (match->driver_data != 0) {
+		return;
+	}
+
+	split_lock_setup();
+}
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
index 6dd78d8235e4..d3482eb43ff3 100644
--- a/arch/x86/kernel/cpu/match.c
+++ b/arch/x86/kernel/cpu/match.c
@@ -16,12 +16,17 @@
* respective wildcard entries.
*
* A typical table entry would be to match a specific CPU
- * { X86_VENDOR_INTEL, 6, 0x12 }
- * or to match a specific CPU feature
- * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) }
+ *
+ * X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, INTEL_FAM6_BROADWELL,
+ * X86_FEATURE_ANY, NULL);
*
* Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
- * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
+ * %X86_MODEL_ANY, %X86_FEATURE_ANY (except for vendor)
+ *
+ * asm/cpu_device_id.h contains a set of useful macros which are shortcuts
+ * for various common selections. The above can be shortened to:
+ *
+ * X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, NULL);
*
* Arrays used to match for this should also be declared using
* MODULE_DEVICE_TABLE(x86cpu, ...)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 2c4f949611e4..54165f3569e8 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -142,6 +142,8 @@ void mce_setup(struct mce *m)
if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
rdmsrl(MSR_PPIN, m->ppin);
+ else if (this_cpu_has(X86_FEATURE_AMD_PPIN))
+ rdmsrl(MSR_AMD_PPIN, m->ppin);
m->microcode = boot_cpu_data.microcode;
}
@@ -1213,8 +1215,14 @@ static void __mc_scan_banks(struct mce *m, struct mce *final,
* On Intel systems this is entered on all CPUs in parallel through
* MCE broadcast. However some CPUs might be broken beyond repair,
* so be always careful when synchronizing with others.
+ *
+ * Tracing and kprobes are disabled: if we interrupted a kernel context
+ * with IF=1, we need to minimize stack usage. There are also recursion
+ * issues: if the machine check was due to a failure of the memory
+ * backing the user stack, tracing that reads the user stack will cause
+ * potentially infinite recursion.
*/
-void do_machine_check(struct pt_regs *regs, long error_code)
+void notrace do_machine_check(struct pt_regs *regs, long error_code)
{
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
@@ -1360,6 +1368,7 @@ out_ist:
ist_exit(regs);
}
EXPORT_SYMBOL_GPL(do_machine_check);
+NOKPROBE_SYMBOL(do_machine_check);
#ifndef CONFIG_MEMORY_FAILURE
int memory_failure(unsigned long pfn, int flags)
@@ -1877,6 +1886,8 @@ bool filter_mce(struct mce *m)
{
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
return amd_filter_mce(m);
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ return intel_filter_mce(m);
return false;
}
@@ -1892,10 +1903,11 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code)
void (*machine_check_vector)(struct pt_regs *, long error_code) =
unexpected_machine_check;
-dotraplinkage void do_mce(struct pt_regs *regs, long error_code)
+dotraplinkage notrace void do_mce(struct pt_regs *regs, long error_code)
{
machine_check_vector(regs, error_code);
}
+NOKPROBE_SYMBOL(do_mce);
/*
* Called for each booted CPU to set up machine checks.
diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c
index 7c8958dee103..d089567a9ce8 100644
--- a/arch/x86/kernel/cpu/mce/dev-mcelog.c
+++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c
@@ -29,11 +29,7 @@ static char *mce_helper_argv[2] = { mce_helper, NULL };
* separate MCEs from kernel messages to avoid bogus bug reports.
*/
-static struct mce_log_buffer mcelog = {
- .signature = MCE_LOG_SIGNATURE,
- .len = MCE_LOG_LEN,
- .recordlen = sizeof(struct mce),
-};
+static struct mce_log_buffer *mcelog;
static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
@@ -45,21 +41,21 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val,
mutex_lock(&mce_chrdev_read_mutex);
- entry = mcelog.next;
+ entry = mcelog->next;
/*
* When the buffer fills up discard new entries. Assume that the
* earlier errors are the more interesting ones:
*/
- if (entry >= MCE_LOG_LEN) {
- set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
+ if (entry >= mcelog->len) {
+ set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog->flags);
goto unlock;
}
- mcelog.next = entry + 1;
+ mcelog->next = entry + 1;
- memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
- mcelog.entry[entry].finished = 1;
+ memcpy(mcelog->entry + entry, mce, sizeof(struct mce));
+ mcelog->entry[entry].finished = 1;
/* wake processes polling /dev/mcelog */
wake_up_interruptible(&mce_chrdev_wait);
@@ -214,21 +210,21 @@ static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
/* Only supports full reads right now */
err = -EINVAL;
- if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
+ if (*off != 0 || usize < mcelog->len * sizeof(struct mce))
goto out;
- next = mcelog.next;
+ next = mcelog->next;
err = 0;
for (i = 0; i < next; i++) {
- struct mce *m = &mcelog.entry[i];
+ struct mce *m = &mcelog->entry[i];
err |= copy_to_user(buf, m, sizeof(*m));
buf += sizeof(*m);
}
- memset(mcelog.entry, 0, next * sizeof(struct mce));
- mcelog.next = 0;
+ memset(mcelog->entry, 0, next * sizeof(struct mce));
+ mcelog->next = 0;
if (err)
err = -EFAULT;
@@ -242,7 +238,7 @@ out:
static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait)
{
poll_wait(file, &mce_chrdev_wait, wait);
- if (READ_ONCE(mcelog.next))
+ if (READ_ONCE(mcelog->next))
return EPOLLIN | EPOLLRDNORM;
if (!mce_apei_read_done && apei_check_mce())
return EPOLLIN | EPOLLRDNORM;
@@ -261,13 +257,13 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
case MCE_GET_RECORD_LEN:
return put_user(sizeof(struct mce), p);
case MCE_GET_LOG_LEN:
- return put_user(MCE_LOG_LEN, p);
+ return put_user(mcelog->len, p);
case MCE_GETCLEAR_FLAGS: {
unsigned flags;
do {
- flags = mcelog.flags;
- } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
+ flags = mcelog->flags;
+ } while (cmpxchg(&mcelog->flags, flags, 0) != flags);
return put_user(flags, p);
}
@@ -339,8 +335,18 @@ static struct miscdevice mce_chrdev_device = {
static __init int dev_mcelog_init_device(void)
{
+ int mce_log_len;
int err;
+ mce_log_len = max(MCE_LOG_MIN_LEN, num_online_cpus());
+ mcelog = kzalloc(sizeof(*mcelog) + mce_log_len * sizeof(struct mce), GFP_KERNEL);
+ if (!mcelog)
+ return -ENOMEM;
+
+ strncpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature));
+ mcelog->len = mce_log_len;
+ mcelog->recordlen = sizeof(struct mce);
+
/* register character device /dev/mcelog */
err = misc_register(&mce_chrdev_device);
if (err) {
@@ -350,6 +356,7 @@ static __init int dev_mcelog_init_device(void)
else
pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
+ kfree(mcelog);
return err;
}
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index f996ffb887bc..d8f9230d2034 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -521,3 +521,20 @@ void mce_intel_feature_clear(struct cpuinfo_x86 *c)
{
intel_clear_lmce();
}
+
+/*
+ * Tell the caller whether machine check record @m should be filtered.
+ *
+ * True is returned only for the error signature covered by MCE errata
+ * HSD131, HSM142, HSW131 and BDM48: the boot CPU is a family 6 Haswell,
+ * Haswell-L, Haswell-G or Broadwell, the record is for bank 0, and its
+ * status equals 0x80000000000f0005 under the mask 0xa0000000ffffffff.
+ */
+bool intel_filter_mce(struct mce *m)
+{
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+
+	if (c->x86 != 6 || m->bank != 0)
+		return false;
+
+	switch (c->x86_model) {
+	case INTEL_FAM6_HASWELL:
+	case INTEL_FAM6_HASWELL_L:
+	case INTEL_FAM6_BROADWELL:
+	case INTEL_FAM6_HASWELL_G:
+		break;
+	default:
+		return false;
+	}
+
+	return (m->status & 0xa0000000ffffffff) == 0x80000000000f0005;
+}
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index b785c0d0b590..3b008172ad73 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -8,6 +8,9 @@
#include <linux/device.h>
#include <asm/mce.h>
+/* Pointer to the installed machine check handler for this CPU setup. */
+extern void (*machine_check_vector)(struct pt_regs *, long error_code);
+
enum severity_level {
MCE_NO_SEVERITY,
MCE_DEFERRED_SEVERITY,
@@ -48,6 +51,7 @@ void cmci_disable_bank(int bank);
void intel_init_cmci(void);
void intel_init_lmce(void);
void intel_clear_lmce(void);
+bool intel_filter_mce(struct mce *m);
#else
# define cmci_intel_adjust_timer mce_adjust_timer_default
static inline bool mce_intel_cmci_poll(void) { return false; }
@@ -56,6 +60,7 @@ static inline void cmci_disable_bank(int bank) { }
static inline void intel_init_cmci(void) { }
static inline void intel_init_lmce(void) { }
static inline void intel_clear_lmce(void) { }
+static inline bool intel_filter_mce(struct mce *m) { return false; } /* stub for the #else branch: never filters */
#endif
void mce_timer_kick(unsigned long interval);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index caa032ce3fe3..ebf34c7bc8bc 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -227,8 +227,8 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
- pr_info("Hyper-V: features 0x%x, hints 0x%x\n",
- ms_hyperv.features, ms_hyperv.hints);
+ pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
+ ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
@@ -263,6 +263,16 @@ static void __init ms_hyperv_init_platform(void)
cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
}
+ /*
+ * Hyper-V expects to get crash register data or kmsg when
+ * crash enlightenment is available and the system crashes. Set
+ * crash_kexec_post_notifiers to true to make sure that the
+ * crash enlightenment interface is called before running the
+ * kdump kernel.
+ */
+ if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE)
+ crash_kexec_post_notifiers = true;
+
#ifdef CONFIG_X86_LOCAL_APIC
if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 89049b343c7a..d8cc5223b7ce 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -578,6 +578,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
d->id = id;
cpumask_set_cpu(cpu, &d->cpu_mask);
+ rdt_domain_reconfigure_cdp(r);
+
if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
kfree(d);
return;
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 181c992f448c..3dd13f3a8b23 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -601,5 +601,6 @@ bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
void __check_limbo(struct rdt_domain *d, bool force_free);
bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r);
bool cbm_validate_amd(char *buf, u32 *data, struct rdt_resource *r);
+void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
#endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 064e9ef44cd6..5a359d9fcc05 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1859,6 +1859,19 @@ static int set_cache_qos_cfg(int level, bool enable)
return 0;
}
+/*
+ * Restore the qos cfg state when a domain comes online.
+ *
+ * Does nothing unless @r is allocation capable and is the L2DATA or the
+ * L3DATA resource; for those, the matching qos cfg update helper is
+ * called with the resource's alloc_enabled value.
+ */
+void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
+{
+	if (!r->alloc_capable)
+		return;
+
+	if (r == &rdt_resources_all[RDT_RESOURCE_L2DATA])
+		l2_qos_cfg_update(&r->alloc_enabled);
+	else if (r == &rdt_resources_all[RDT_RESOURCE_L3DATA])
+		l3_qos_cfg_update(&r->alloc_enabled);
+}
+
/*
* Enable or disable the MBA software controller
* which helps user specify bandwidth in MBps.
@@ -3072,7 +3085,8 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
* If the rdtgroup is a mon group and parent directory
* is a valid "mon_groups" directory, remove the mon group.
*/
- if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) {
+ if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
+ rdtgrp != &rdtgroup_default) {
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
ret = rdtgroup_ctrl_remove(kn, rdtgrp);
diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c
index c222f283b456..300e3fd5ade3 100644
--- a/arch/x86/kernel/cpu/umwait.c
+++ b/arch/x86/kernel/cpu/umwait.c
@@ -4,6 +4,7 @@
#include <linux/cpu.h>
#include <asm/msr.h>
+#include <asm/mwait.h>
#define UMWAIT_C02_ENABLE 0
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 46d732696c1c..9b6fafa69be9 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -25,6 +25,8 @@
#include <linux/init.h>
#include <linux/export.h>
#include <linux/clocksource.h>
+#include <linux/cpu.h>
+#include <linux/reboot.h>
#include <asm/div64.h>
#include <asm/x86_init.h>
#include <asm/hypervisor.h>
@@ -47,6 +49,11 @@
#define VMWARE_CMD_GETVCPU_INFO 68
#define VMWARE_CMD_LEGACY_X2APIC 3
#define VMWARE_CMD_VCPU_RESERVED 31
+#define VMWARE_CMD_STEALCLOCK 91 /* command number loaded into ECX by vmware_cmd_stealclock() */
+
+#define STEALCLOCK_NOT_AVAILABLE (-1) /* result tested by vmware_is_stealclock_available() */
+#define STEALCLOCK_DISABLED 0
+#define STEALCLOCK_ENABLED 1 /* result stealclock_enable() treats as success */
#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
__asm__("inl (%%dx), %%eax" : \
@@ -86,6 +93,18 @@
} \
} while (0)
+struct vmware_steal_time { /* per-CPU area whose physical address is passed to the hypervisor by vmware_register_steal_time() */
+ union {
+ uint64_t clock; /* stolen time counter in units of vtsc */
+ struct {
+ /* only for little-endian */
+ uint32_t clock_low; /* low half, read separately by vmware_steal_clock() on 32-bit */
+ uint32_t clock_high; /* high half, re-read to detect a concurrent update */
+ };
+ };
+ uint64_t reserved[7]; /* brings the structure to 64 bytes, matching the __aligned(64) per-CPU definition */
+};
+
static unsigned long vmware_tsc_khz __ro_after_init;
static u8 vmware_hypercall_mode __ro_after_init;
@@ -103,15 +122,25 @@ static unsigned long vmware_get_tsc_khz(void)
#ifdef CONFIG_PARAVIRT
static struct cyc2ns_data vmware_cyc2ns __ro_after_init;
-static int vmw_sched_clock __initdata = 1;
+static bool vmw_sched_clock __initdata = true;
+static DEFINE_PER_CPU_DECRYPTED(struct vmware_steal_time, vmw_steal_time) __aligned(64);
+static bool has_steal_clock;
+static bool steal_acc __initdata = true; /* steal time accounting */
static __init int setup_vmw_sched_clock(char *s)
{
- vmw_sched_clock = 0;
+ vmw_sched_clock = false;
return 0;
}
early_param("no-vmw-sched-clock", setup_vmw_sched_clock);
+/*
+ * Early parameter handler for "no-steal-acc": turns steal time accounting
+ * off. The parameter's value, @arg, is ignored.
+ */
+static __init int parse_no_stealacc(char *arg)
+{
+	steal_acc = false;
+
+	return 0;
+}
+early_param("no-steal-acc", parse_no_stealacc);
+
static unsigned long long notrace vmware_sched_clock(void)
{
unsigned long long ns;
@@ -122,7 +151,7 @@ static unsigned long long notrace vmware_sched_clock(void)
return ns;
}
-static void __init vmware_sched_clock_setup(void)
+static void __init vmware_cyc2ns_setup(void)
{
struct cyc2ns_data *d = &vmware_cyc2ns;
unsigned long long tsc_now = rdtsc();
@@ -132,17 +161,201 @@ static void __init vmware_sched_clock_setup(void)
d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul,
d->cyc2ns_shift);
- pv_ops.time.sched_clock = vmware_sched_clock;
- pr_info("using sched offset of %llu ns\n", d->cyc2ns_offset);
+ pr_info("using clock offset of %llu ns\n", d->cyc2ns_offset);
+}
+
+static int vmware_cmd_stealclock(uint32_t arg1, uint32_t arg2)
+{
+ uint32_t result, info; /* info only receives the ECX output and is not used afterwards */
+
+ asm volatile (VMWARE_HYPERCALL :
+ "=a"(result), /* EAX out: status returned to the caller */
+ "=c"(info) :
+ "a"(VMWARE_HYPERVISOR_MAGIC),
+ "b"(0),
+ "c"(VMWARE_CMD_STEALCLOCK), /* ECX in: command number */
+ "d"(0),
+ "S"(arg1), /* ESI in */
+ "D"(arg2) : /* EDI in */
+ "memory");
+ return result; /* returned as int so callers can compare against the STEALCLOCK_* codes */
+}
+
+/*
+ * Issue the steal clock command with physical address @pa split into its
+ * upper and lower 32-bit halves. Returns true only when the command
+ * reports STEALCLOCK_ENABLED.
+ */
+static bool stealclock_enable(phys_addr_t pa)
+{
+	int status = vmware_cmd_stealclock(upper_32_bits(pa),
+					   lower_32_bits(pa));
+
+	return status == STEALCLOCK_ENABLED;
+}
+
+static int __stealclock_disable(void)
+{
+ return vmware_cmd_stealclock(0, 1); /* NOTE(review): arguments (0, 1) presumably encode the "disable" request — confirm against the hypervisor interface */
+}
+
+/* Issue the steal clock disable command and discard its status. */
+static void stealclock_disable(void)
+{
+	(void)__stealclock_disable();
+}
+
+/*
+ * Probe for steal clock support by issuing the disable command: any
+ * status other than STEALCLOCK_NOT_AVAILABLE counts as available.
+ */
+static bool vmware_is_stealclock_available(void)
+{
+	int status = __stealclock_disable();
+
+	return status != STEALCLOCK_NOT_AVAILABLE;
+}
+
+/**
+ * vmware_steal_clock() - read the per-cpu steal clock
+ * @cpu: the cpu number whose steal clock we want to read
+ *
+ * On a 64-bit kernel the counter is read with a single READ_ONCE().
+ * Otherwise it is read in two 32-bit parts, and the read is retried until
+ * the high part is the same before and after reading the low part, i.e.
+ * until the high part didn't change in the meantime.
+ *
+ * The raw counter is then scaled using the multiplier and shift stored in
+ * vmware_cyc2ns.
+ *
+ * Return:
+ * The steal clock reading in ns.
+ */
+static uint64_t vmware_steal_clock(int cpu)
+{
+ struct vmware_steal_time *steal = &per_cpu(vmw_steal_time, cpu);
+ uint64_t clock;
+
+ if (IS_ENABLED(CONFIG_64BIT))
+ clock = READ_ONCE(steal->clock);
+ else {
+ uint32_t initial_high, low, high;
+
+ do {
+ initial_high = READ_ONCE(steal->clock_high);
+ /* Do not reorder initial_high and high readings */
+ virt_rmb();
+ low = READ_ONCE(steal->clock_low);
+ /* Keep low reading in between */
+ virt_rmb();
+ high = READ_ONCE(steal->clock_high);
+ } while (initial_high != high);
+
+ clock = ((uint64_t)high << 32) | low;
+ }
+
+ return mul_u64_u32_shr(clock, vmware_cyc2ns.cyc2ns_mul,
+ vmware_cyc2ns.cyc2ns_shift);
+}
+
+/*
+ * Hand the calling CPU's vmw_steal_time area to the hypervisor.
+ *
+ * Does nothing when has_steal_clock is false. If the hypervisor does not
+ * report the steal clock as enabled, has_steal_clock is cleared; on
+ * success the CPU number and the physical address are logged.
+ */
+static void vmware_register_steal_time(void)
+{
+	int cpu = smp_processor_id();
+	phys_addr_t pa;
+
+	if (!has_steal_clock)
+		return;
+
+	pa = slow_virt_to_phys(&per_cpu(vmw_steal_time, cpu));
+	if (!stealclock_enable(pa)) {
+		has_steal_clock = false;
+		return;
+	}
+
+	pr_info("vmware-stealtime: cpu %d, pa %llx\n",
+		cpu, (unsigned long long) pa);
}
+/* Issue the steal clock disable command, but only if has_steal_clock is set. */
+static void vmware_disable_steal_time(void)
+{
+	if (has_steal_clock)
+		stealclock_disable();
+}
+
+/* Per-CPU guest setup: register the steal time area when has_steal_clock is set. */
+static void vmware_guest_cpu_init(void)
+{
+	if (!has_steal_clock)
+		return;
+
+	vmware_register_steal_time();
+}
+
+/*
+ * Callback run on each CPU by vmware_pv_reboot_notify(); disables the
+ * steal clock. @unused is ignored.
+ */
+static void vmware_pv_guest_cpu_reboot(void *unused)
+{
+	(void)unused;
+	vmware_disable_steal_time();
+}
+
+/*
+ * Reboot notifier callback. For SYS_RESTART, runs
+ * vmware_pv_guest_cpu_reboot() on every CPU and waits for completion;
+ * other codes are ignored. Always returns NOTIFY_DONE.
+ */
+static int vmware_pv_reboot_notify(struct notifier_block *nb,
+				   unsigned long code, void *unused)
+{
+	if (code != SYS_RESTART)
+		return NOTIFY_DONE;
+
+	on_each_cpu(vmware_pv_guest_cpu_reboot, NULL, 1);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block vmware_pv_reboot_nb = { /* registered by vmware_paravirt_ops_setup() when the steal clock is available */
+ .notifier_call = vmware_pv_reboot_notify,
+};
+
+#ifdef CONFIG_SMP
+static void __init vmware_smp_prepare_boot_cpu(void) /* installed as smp_ops.smp_prepare_boot_cpu */
+{
+ vmware_guest_cpu_init(); /* register steal time for the boot CPU first */
+ native_smp_prepare_boot_cpu(); /* then do the native preparation */
+}
+
+static int vmware_cpu_online(unsigned int cpu) /* hotplug "online" callback; the cpu argument is not used */
+{
+ local_irq_disable(); /* registration runs with local interrupts off */
+ vmware_guest_cpu_init();
+ local_irq_enable();
+ return 0;
+}
+
+static int vmware_cpu_down_prepare(unsigned int cpu) /* hotplug teardown callback; the cpu argument is not used */
+{
+ local_irq_disable(); /* disabling runs with local interrupts off */
+ vmware_disable_steal_time();
+ local_irq_enable();
+ return 0;
+}
+#endif
+
+/*
+ * Initcall that switches on the paravirt steal time static keys.
+ *
+ * When has_steal_clock is set, paravirt_steal_enabled is incremented, and
+ * paravirt_steal_rq_enabled too unless steal accounting was turned off
+ * with "no-steal-acc". Always returns 0.
+ */
+static __init int activate_jump_labels(void)
+{
+	if (!has_steal_clock)
+		return 0;
+
+	static_key_slow_inc(&paravirt_steal_enabled);
+	if (steal_acc)
+		static_key_slow_inc(&paravirt_steal_rq_enabled);
+
+	return 0;
+}
+arch_initcall(activate_jump_labels);
+
static void __init vmware_paravirt_ops_setup(void)
{
pv_info.name = "VMware hypervisor";
pv_ops.cpu.io_delay = paravirt_nop;
- if (vmware_tsc_khz && vmw_sched_clock)
- vmware_sched_clock_setup();
+ if (vmware_tsc_khz == 0)
+ return;
+
+ vmware_cyc2ns_setup();
+
+ if (vmw_sched_clock)
+ pv_ops.time.sched_clock = vmware_sched_clock;
+
+ if (vmware_is_stealclock_available()) {
+ has_steal_clock = true;
+ pv_ops.time.steal_clock = vmware_steal_clock;
+
+ /* We use reboot notifier only to disable steal clock */
+ register_reboot_notifier(&vmware_pv_reboot_nb);
+
+#ifdef CONFIG_SMP
+ smp_ops.smp_prepare_boot_cpu =
+ vmware_smp_prepare_boot_cpu;
+ if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "x86/vmware:online",
+ vmware_cpu_online,
+ vmware_cpu_down_prepare) < 0)
+ pr_err("vmware_guest: Failed to install cpu hotplug callbacks\n");
+#else
+ vmware_guest_cpu_init();
+#endif
+ }
}
#else
#define vmware_paravirt_ops_setup() do {} while (0)
@@ -213,7 +426,7 @@ static void __init vmware_platform_setup(void)
vmware_set_capabilities();
}
-static u8 vmware_select_hypercall(void)
+static u8 __init vmware_select_hypercall(void)
{
int eax, ebx, ecx, edx;