aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/acpi/apei.c5
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S2
-rw-r--r--arch/x86/kernel/alternative.c2
-rw-r--r--arch/x86/kernel/amd_nb.c4
-rw-r--r--arch/x86/kernel/apic/apic.c73
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c18
-rw-r--r--arch/x86/kernel/apic/apic_noop.c10
-rw-r--r--arch/x86/kernel/apic/apic_numachip.c16
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c9
-rw-r--r--arch/x86/kernel/apic/io_apic.c525
-rw-r--r--arch/x86/kernel/apic/ipi.c6
-rw-r--r--arch/x86/kernel/apic/msi.c153
-rw-r--r--arch/x86/kernel/apic/probe_32.c9
-rw-r--r--arch/x86/kernel/apic/vector.c49
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c10
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c17
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c35
-rw-r--r--arch/x86/kernel/asm-offsets.c1
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/amd.c38
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c16
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c8
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c1
-rw-r--r--arch/x86/kernel/cpu/feat_ctl.c38
-rw-r--r--arch/x86/kernel/cpu/hygon.c31
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c4
-rw-r--r--arch/x86/kernel/cpu/mce/apei.c61
-rw-r--r--arch/x86/kernel/cpu/mce/core.c49
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c4
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c21
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c1
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c29
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c5
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c4
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h3
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c82
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c2
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c21
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
-rw-r--r--arch/x86/kernel/cpu/sgx/Makefile5
-rw-r--r--arch/x86/kernel/cpu/sgx/arch.h338
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.c194
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.h29
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c740
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.h119
-rw-r--r--arch/x86/kernel/cpu/sgx/encls.h231
-rw-r--r--arch/x86/kernel/cpu/sgx/ioctl.c716
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c733
-rw-r--r--arch/x86/kernel/cpu/sgx/sgx.h86
-rw-r--r--arch/x86/kernel/cpu/topology.c10
-rw-r--r--arch/x86/kernel/cpu/vmware.c12
-rw-r--r--arch/x86/kernel/cpuid.c7
-rw-r--r--arch/x86/kernel/crash_dump_32.c48
-rw-r--r--arch/x86/kernel/devicetree.c30
-rw-r--r--arch/x86/kernel/dumpstack.c2
-rw-r--r--arch/x86/kernel/ftrace_64.S15
-rw-r--r--arch/x86/kernel/head64.c1
-rw-r--r--arch/x86/kernel/head_64.S29
-rw-r--r--arch/x86/kernel/hpet.c122
-rw-r--r--arch/x86/kernel/ima_arch.c94
-rw-r--r--arch/x86/kernel/kprobes/core.c6
-rw-r--r--arch/x86/kernel/kprobes/ftrace.c15
-rw-r--r--arch/x86/kernel/kvm.c6
-rw-r--r--arch/x86/kernel/kvmclock.c1
-rw-r--r--arch/x86/kernel/msr.c8
-rw-r--r--arch/x86/kernel/nmi.c6
-rw-r--r--arch/x86/kernel/perf_regs.c2
-rw-r--r--arch/x86/kernel/process_64.c28
-rw-r--r--arch/x86/kernel/setup.c11
-rw-r--r--arch/x86/kernel/signal.c4
-rw-r--r--arch/x86/kernel/signal_compat.c9
-rw-r--r--arch/x86/kernel/smpboot.c88
-rw-r--r--arch/x86/kernel/tboot.c1
-rw-r--r--arch/x86/kernel/topology.c1
-rw-r--r--arch/x86/kernel/traps.c26
-rw-r--r--arch/x86/kernel/uprobes.c2
-rw-r--r--arch/x86/kernel/vmlinux.lds.S12
-rw-r--r--arch/x86/kernel/x86_init.c1
79 files changed, 4306 insertions, 848 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 68608bd892c0..5eeb808eb024 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -161,5 +161,3 @@ ifeq ($(CONFIG_X86_64),y)
obj-$(CONFIG_MMCONF_FAM10H) += mmconf-fam10h_64.o
obj-y += vsmp_64.o
endif
-
-obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c
index c22fb55abcfd..0916f00a992e 100644
--- a/arch/x86/kernel/acpi/apei.c
+++ b/arch/x86/kernel/acpi/apei.c
@@ -43,3 +43,8 @@ void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
apei_mce_report_mem_error(sev, mem_err);
#endif
}
+
+int arch_apei_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
+{
+ return apei_smca_report_x86_error(ctx_info, lapic_id);
+}
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index c8daa92f38dc..5d3a0b8fd379 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -112,7 +112,7 @@ SYM_FUNC_START(do_suspend_lowlevel)
movq pt_regs_r14(%rax), %r14
movq pt_regs_r15(%rax), %r15
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK
/*
* The suspend path may have poisoned some areas deeper in the stack,
* which we now need to unpoison.
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 2400ad62f330..8d778e46725d 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1374,7 +1374,7 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi
* @addr: address to patch
* @opcode: opcode of new instruction
* @len: length to copy
- * @handler: address to jump to when the temporary breakpoint is hit
+ * @emulate: instruction to be emulated
*
* Update a single instruction with the vector in the stack, avoiding
* dynamically allocated memory. This function should be used when it is
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 18f6b7c4bd79..b4396952c9a6 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -384,7 +384,7 @@ struct resource *amd_get_mmconfig_range(struct resource *res)
int amd_get_subcaches(int cpu)
{
- struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
+ struct pci_dev *link = node_to_amd_nb(topology_die_id(cpu))->link;
unsigned int mask;
if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
@@ -398,7 +398,7 @@ int amd_get_subcaches(int cpu)
int amd_set_subcaches(int cpu, unsigned long mask)
{
static unsigned int reset, ban;
- struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
+ struct amd_northbridge *nb = node_to_amd_nb(topology_die_id(cpu));
unsigned int reg;
int cuid;
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b3eef1d5c903..6bd20c0de8bc 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -94,6 +94,11 @@ static unsigned int disabled_cpu_apicid __ro_after_init = BAD_APICID;
static int apic_extnmi __ro_after_init = APIC_EXTNMI_BSP;
/*
+ * Hypervisor supports 15 bits of APIC ID in MSI Extended Destination ID
+ */
+static bool virt_ext_dest_id __ro_after_init;
+
+/*
* Map cpu index to physical APIC ID
*/
DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
@@ -1591,7 +1596,7 @@ static void setup_local_APIC(void)
apic->init_apic_ldr();
#ifdef CONFIG_X86_32
- if (apic->dest_logical) {
+ if (apic->dest_mode_logical) {
int logical_apicid, ldr_apicid;
/*
@@ -1841,20 +1846,34 @@ static __init void try_to_enable_x2apic(int remap_mode)
return;
if (remap_mode != IRQ_REMAP_X2APIC_MODE) {
- /* IR is required if there is APIC ID > 255 even when running
- * under KVM
+ u32 apic_limit = 255;
+
+ /*
+ * Using X2APIC without IR is not architecturally supported
+ * on bare metal but may be supported in guests.
*/
- if (max_physical_apicid > 255 ||
- !x86_init.hyper.x2apic_available()) {
+ if (!x86_init.hyper.x2apic_available()) {
pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n");
x2apic_disable();
return;
}
/*
- * without IR all CPUs can be addressed by IOAPIC/MSI
- * only in physical mode
+ * If the hypervisor supports extended destination ID in
+ * MSI, that increases the maximum APIC ID that can be
+ * used for non-remapped IRQ domains.
*/
+ if (x86_init.hyper.msi_ext_dest_id()) {
+ virt_ext_dest_id = 1;
+ apic_limit = 32767;
+ }
+
+ /*
+ * Without IR, all CPUs can be addressed by IOAPIC/MSI only
+ * in physical mode, and CPUs with an APIC ID that cannnot
+ * be addressed must not be brought online.
+ */
+ x2apic_set_max_apicid(apic_limit);
x2apic_phys = 1;
}
x2apic_enable();
@@ -2478,6 +2497,46 @@ int hard_smp_processor_id(void)
return read_apic_id();
}
+void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg,
+ bool dmar)
+{
+ memset(msg, 0, sizeof(*msg));
+
+ msg->arch_addr_lo.base_address = X86_MSI_BASE_ADDRESS_LOW;
+ msg->arch_addr_lo.dest_mode_logical = apic->dest_mode_logical;
+ msg->arch_addr_lo.destid_0_7 = cfg->dest_apicid & 0xFF;
+
+ msg->arch_data.delivery_mode = APIC_DELIVERY_MODE_FIXED;
+ msg->arch_data.vector = cfg->vector;
+
+ msg->address_hi = X86_MSI_BASE_ADDRESS_HIGH;
+ /*
+ * Only the IOMMU itself can use the trick of putting destination
+ * APIC ID into the high bits of the address. Anything else would
+ * just be writing to memory if it tried that, and needs IR to
+ * address APICs which can't be addressed in the normal 32-bit
+ * address range at 0xFFExxxxx. That is typically just 8 bits, but
+ * some hypervisors allow the extended destination ID field in bits
+ * 5-11 to be used, giving support for 15 bits of APIC IDs in total.
+ */
+ if (dmar)
+ msg->arch_addr_hi.destid_8_31 = cfg->dest_apicid >> 8;
+ else if (virt_ext_dest_id && cfg->dest_apicid < 0x8000)
+ msg->arch_addr_lo.virt_destid_8_14 = cfg->dest_apicid >> 8;
+ else
+ WARN_ON_ONCE(cfg->dest_apicid > 0xFF);
+}
+
+u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid)
+{
+ u32 dest = msg->arch_addr_lo.destid_0_7;
+
+ if (extid)
+ dest |= msg->arch_addr_hi.destid_8_31 << 8;
+ return dest;
+}
+EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid);
+
/*
* Override the generic EOI implementation with an optimized version.
* Only called during early boot when only one CPU is active and with
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 7862b152a052..8f72b4351c9f 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -53,7 +53,7 @@ static void _flat_send_IPI_mask(unsigned long mask, int vector)
unsigned long flags;
local_irq_save(flags);
- __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
+ __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
local_irq_restore(flags);
}
@@ -113,15 +113,13 @@ static struct apic apic_flat __ro_after_init = {
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = flat_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 1, /* logical */
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = true,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = NULL,
+ .check_apicid_used = NULL,
.init_apic_ldr = flat_init_apic_ldr,
-
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
@@ -206,15 +204,13 @@ static struct apic apic_physflat __ro_after_init = {
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = flat_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* physical */
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = false,
.disable_esr = 0,
- .dest_logical = 0,
- .check_apicid_used = NULL,
+ .check_apicid_used = NULL,
.init_apic_ldr = physflat_init_apic_ldr,
-
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index 780c702969b7..fe78319e0f7a 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -95,19 +95,15 @@ struct apic apic_noop __ro_after_init = {
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = noop_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- /* logical delivery broadcast to all CPUs: */
- .irq_dest_mode = 1,
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = true,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = default_check_apicid_used,
+ .check_apicid_used = default_check_apicid_used,
.init_apic_ldr = noop_init_apic_ldr,
-
.ioapic_phys_id_map = default_ioapic_phys_id_map,
.setup_apic_routing = NULL,
-
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = physid_set_mask_of_physid,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index 35edd57f064a..a54d817eb4b6 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -246,15 +246,13 @@ static const struct apic apic_numachip1 __refconst = {
.apic_id_valid = numachip_apic_id_valid,
.apic_id_registered = numachip_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* physical */
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = false,
.disable_esr = 0,
- .dest_logical = 0,
- .check_apicid_used = NULL,
+ .check_apicid_used = NULL,
.init_apic_ldr = flat_init_apic_ldr,
-
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
@@ -295,15 +293,13 @@ static const struct apic apic_numachip2 __refconst = {
.apic_id_valid = numachip_apic_id_valid,
.apic_id_registered = numachip_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* physical */
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = false,
.disable_esr = 0,
- .dest_logical = 0,
- .check_apicid_used = NULL,
+ .check_apicid_used = NULL,
.init_apic_ldr = flat_init_apic_ldr,
-
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 98d015a4405a..77555f66c14d 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -127,16 +127,13 @@ static struct apic apic_bigsmp __ro_after_init = {
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = bigsmp_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- /* phys delivery to target CPU: */
- .irq_dest_mode = 0,
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = false,
.disable_esr = 1,
- .dest_logical = 0,
- .check_apicid_used = bigsmp_check_apicid_used,
+ .check_apicid_used = bigsmp_check_apicid_used,
.init_apic_ldr = bigsmp_init_apic_ldr,
-
.ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
.setup_apic_routing = bigsmp_setup_apic_routing,
.cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7b3c7e0d4a09..e4ab4804b20d 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -48,6 +48,7 @@
#include <linux/jiffies.h> /* time_after() */
#include <linux/slab.h>
#include <linux/memblock.h>
+#include <linux/msi.h>
#include <asm/irqdomain.h>
#include <asm/io.h>
@@ -63,7 +64,6 @@
#include <asm/setup.h>
#include <asm/irq_remapping.h>
#include <asm/hw_irq.h>
-
#include <asm/apic.h>
#define for_each_ioapic(idx) \
@@ -89,12 +89,12 @@ struct irq_pin_list {
};
struct mp_chip_data {
- struct list_head irq_2_pin;
- struct IO_APIC_route_entry entry;
- int trigger;
- int polarity;
+ struct list_head irq_2_pin;
+ struct IO_APIC_route_entry entry;
+ bool is_level;
+ bool active_low;
+ bool isa_irq;
u32 count;
- bool isa_irq;
};
struct mp_ioapic_gsi {
@@ -286,31 +286,26 @@ static void io_apic_write(unsigned int apic, unsigned int reg,
writel(value, &io_apic->data);
}
-union entry_union {
- struct { u32 w1, w2; };
- struct IO_APIC_route_entry entry;
-};
-
static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin)
{
- union entry_union eu;
+ struct IO_APIC_route_entry entry;
- eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
- eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
+ entry.w1 = io_apic_read(apic, 0x10 + 2 * pin);
+ entry.w2 = io_apic_read(apic, 0x11 + 2 * pin);
- return eu.entry;
+ return entry;
}
static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
{
- union entry_union eu;
+ struct IO_APIC_route_entry entry;
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- eu.entry = __ioapic_read_entry(apic, pin);
+ entry = __ioapic_read_entry(apic, pin);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- return eu.entry;
+ return entry;
}
/*
@@ -321,11 +316,8 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
*/
static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
- union entry_union eu = {{0, 0}};
-
- eu.entry = e;
- io_apic_write(apic, 0x11 + 2*pin, eu.w2);
- io_apic_write(apic, 0x10 + 2*pin, eu.w1);
+ io_apic_write(apic, 0x11 + 2*pin, e.w2);
+ io_apic_write(apic, 0x10 + 2*pin, e.w1);
}
static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
@@ -344,12 +336,12 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
*/
static void ioapic_mask_entry(int apic, int pin)
{
+ struct IO_APIC_route_entry e = { .masked = true };
unsigned long flags;
- union entry_union eu = { .entry.mask = IOAPIC_MASKED };
raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x10 + 2*pin, eu.w1);
- io_apic_write(apic, 0x11 + 2*pin, eu.w2);
+ io_apic_write(apic, 0x10 + 2*pin, e.w1);
+ io_apic_write(apic, 0x11 + 2*pin, e.w2);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
@@ -422,20 +414,15 @@ static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
add_pin_to_irq_node(data, node, newapic, newpin);
}
-static void io_apic_modify_irq(struct mp_chip_data *data,
- int mask_and, int mask_or,
+static void io_apic_modify_irq(struct mp_chip_data *data, bool masked,
void (*final)(struct irq_pin_list *entry))
{
- union entry_union eu;
struct irq_pin_list *entry;
- eu.entry = data->entry;
- eu.w1 &= mask_and;
- eu.w1 |= mask_or;
- data->entry = eu.entry;
+ data->entry.masked = masked;
for_each_irq_pin(entry, data->irq_2_pin) {
- io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1);
+ io_apic_write(entry->apic, 0x10 + 2 * entry->pin, data->entry.w1);
if (final)
final(entry);
}
@@ -459,13 +446,13 @@ static void mask_ioapic_irq(struct irq_data *irq_data)
unsigned long flags;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+ io_apic_modify_irq(data, true, &io_apic_sync);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
}
static void __unmask_ioapic(struct mp_chip_data *data)
{
- io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL);
+ io_apic_modify_irq(data, false, NULL);
}
static void unmask_ioapic_irq(struct irq_data *irq_data)
@@ -506,8 +493,8 @@ static void __eoi_ioapic_pin(int apic, int pin, int vector)
/*
* Mask the entry and change the trigger mode to edge.
*/
- entry1.mask = IOAPIC_MASKED;
- entry1.trigger = IOAPIC_EDGE;
+ entry1.masked = true;
+ entry1.is_level = false;
__ioapic_write_entry(apic, pin, entry1);
@@ -535,15 +522,15 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
/* Check delivery_mode to be sure we're not clearing an SMI pin */
entry = ioapic_read_entry(apic, pin);
- if (entry.delivery_mode == dest_SMI)
+ if (entry.delivery_mode == APIC_DELIVERY_MODE_SMI)
return;
/*
* Make sure the entry is masked and re-read the contents to check
* if it is a level triggered pin and if the remote-IRR is set.
*/
- if (entry.mask == IOAPIC_UNMASKED) {
- entry.mask = IOAPIC_MASKED;
+ if (!entry.masked) {
+ entry.masked = true;
ioapic_write_entry(apic, pin, entry);
entry = ioapic_read_entry(apic, pin);
}
@@ -556,8 +543,8 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
* doesn't clear the remote-IRR if the trigger mode is not
* set to level.
*/
- if (entry.trigger == IOAPIC_EDGE) {
- entry.trigger = IOAPIC_LEVEL;
+ if (!entry.is_level) {
+ entry.is_level = true;
ioapic_write_entry(apic, pin, entry);
}
raw_spin_lock_irqsave(&ioapic_lock, flags);
@@ -659,8 +646,8 @@ void mask_ioapic_entries(void)
struct IO_APIC_route_entry entry;
entry = ioapics[apic].saved_registers[pin];
- if (entry.mask == IOAPIC_UNMASKED) {
- entry.mask = IOAPIC_MASKED;
+ if (!entry.masked) {
+ entry.masked = true;
ioapic_write_entry(apic, pin, entry);
}
}
@@ -745,44 +732,7 @@ static int __init find_isa_irq_apic(int irq, int type)
return -1;
}
-#ifdef CONFIG_EISA
-/*
- * EISA Edge/Level control register, ELCR
- */
-static int EISA_ELCR(unsigned int irq)
-{
- if (irq < nr_legacy_irqs()) {
- unsigned int port = 0x4d0 + (irq >> 3);
- return (inb(port) >> (irq & 7)) & 1;
- }
- apic_printk(APIC_VERBOSE, KERN_INFO
- "Broken MPtable reports ISA irq %d\n", irq);
- return 0;
-}
-
-#endif
-
-/* ISA interrupts are always active high edge triggered,
- * when listed as conforming in the MP table. */
-
-#define default_ISA_trigger(idx) (IOAPIC_EDGE)
-#define default_ISA_polarity(idx) (IOAPIC_POL_HIGH)
-
-/* EISA interrupts are always polarity zero and can be edge or level
- * trigger depending on the ELCR value. If an interrupt is listed as
- * EISA conforming in the MP table, that means its trigger type must
- * be read in from the ELCR */
-
-#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].srcbusirq))
-#define default_EISA_polarity(idx) default_ISA_polarity(idx)
-
-/* PCI interrupts are always active low level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_PCI_trigger(idx) (IOAPIC_LEVEL)
-#define default_PCI_polarity(idx) (IOAPIC_POL_LOW)
-
-static int irq_polarity(int idx)
+static bool irq_active_low(int idx)
{
int bus = mp_irqs[idx].srcbus;
@@ -791,90 +741,139 @@ static int irq_polarity(int idx)
*/
switch (mp_irqs[idx].irqflag & MP_IRQPOL_MASK) {
case MP_IRQPOL_DEFAULT:
- /* conforms to spec, ie. bus-type dependent polarity */
- if (test_bit(bus, mp_bus_not_pci))
- return default_ISA_polarity(idx);
- else
- return default_PCI_polarity(idx);
+ /*
+ * Conforms to spec, ie. bus-type dependent polarity. PCI
+ * defaults to low active. [E]ISA defaults to high active.
+ */
+ return !test_bit(bus, mp_bus_not_pci);
case MP_IRQPOL_ACTIVE_HIGH:
- return IOAPIC_POL_HIGH;
+ return false;
case MP_IRQPOL_RESERVED:
pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
fallthrough;
case MP_IRQPOL_ACTIVE_LOW:
default: /* Pointless default required due to do gcc stupidity */
- return IOAPIC_POL_LOW;
+ return true;
}
}
#ifdef CONFIG_EISA
-static int eisa_irq_trigger(int idx, int bus, int trigger)
+/*
+ * EISA Edge/Level control register, ELCR
+ */
+static bool EISA_ELCR(unsigned int irq)
+{
+ if (irq < nr_legacy_irqs()) {
+ unsigned int port = 0x4d0 + (irq >> 3);
+ return (inb(port) >> (irq & 7)) & 1;
+ }
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "Broken MPtable reports ISA irq %d\n", irq);
+ return false;
+}
+
+/*
+ * EISA interrupts are always active high and can be edge or level
+ * triggered depending on the ELCR value. If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must be
+ * read in from the ELCR.
+ */
+static bool eisa_irq_is_level(int idx, int bus, bool level)
{
switch (mp_bus_id_to_type[bus]) {
case MP_BUS_PCI:
case MP_BUS_ISA:
- return trigger;
+ return level;
case MP_BUS_EISA:
- return default_EISA_trigger(idx);
+ return EISA_ELCR(mp_irqs[idx].srcbusirq);
}
pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus);
- return IOAPIC_LEVEL;
+ return true;
}
#else
-static inline int eisa_irq_trigger(int idx, int bus, int trigger)
+static inline int eisa_irq_is_level(int idx, int bus, bool level)
{
- return trigger;
+ return level;
}
#endif
-static int irq_trigger(int idx)
+static bool irq_is_level(int idx)
{
int bus = mp_irqs[idx].srcbus;
- int trigger;
+ bool level;
/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
switch (mp_irqs[idx].irqflag & MP_IRQTRIG_MASK) {
case MP_IRQTRIG_DEFAULT:
- /* conforms to spec, ie. bus-type dependent trigger mode */
- if (test_bit(bus, mp_bus_not_pci))
- trigger = default_ISA_trigger(idx);
- else
- trigger = default_PCI_trigger(idx);
+ /*
+ * Conforms to spec, ie. bus-type dependent trigger
+ * mode. PCI defaults to level, ISA to edge.
+ */
+ level = !test_bit(bus, mp_bus_not_pci);
/* Take EISA into account */
- return eisa_irq_trigger(idx, bus, trigger);
+ return eisa_irq_is_level(idx, bus, level);
case MP_IRQTRIG_EDGE:
- return IOAPIC_EDGE;
+ return false;
case MP_IRQTRIG_RESERVED:
pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
fallthrough;
case MP_IRQTRIG_LEVEL:
default: /* Pointless default required due to do gcc stupidity */
- return IOAPIC_LEVEL;
+ return true;
}
}
+static int __acpi_get_override_irq(u32 gsi, bool *trigger, bool *polarity)
+{
+ int ioapic, pin, idx;
+
+ if (skip_ioapic_setup)
+ return -1;
+
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ return -1;
+
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ if (pin < 0)
+ return -1;
+
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if (idx < 0)
+ return -1;
+
+ *trigger = irq_is_level(idx);
+ *polarity = irq_active_low(idx);
+ return 0;
+}
+
+#ifdef CONFIG_ACPI
+int acpi_get_override_irq(u32 gsi, int *is_level, int *active_low)
+{
+ *is_level = *active_low = 0;
+ return __acpi_get_override_irq(gsi, (bool *)is_level,
+ (bool *)active_low);
+}
+#endif
+
void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node,
int trigger, int polarity)
{
init_irq_alloc_info(info, NULL);
info->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
info->ioapic.node = node;
- info->ioapic.trigger = trigger;
- info->ioapic.polarity = polarity;
+ info->ioapic.is_level = trigger;
+ info->ioapic.active_low = polarity;
info->ioapic.valid = 1;
}
-#ifndef CONFIG_ACPI
-int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
-#endif
-
static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst,
struct irq_alloc_info *src,
u32 gsi, int ioapic_idx, int pin)
{
- int trigger, polarity;
+ bool level, pol_low;
copy_irq_alloc_info(dst, src);
dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
@@ -883,20 +882,20 @@ static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst,
dst->ioapic.valid = 1;
if (src && src->ioapic.valid) {
dst->ioapic.node = src->ioapic.node;
- dst->ioapic.trigger = src->ioapic.trigger;
- dst->ioapic.polarity = src->ioapic.polarity;
+ dst->ioapic.is_level = src->ioapic.is_level;
+ dst->ioapic.active_low = src->ioapic.active_low;
} else {
dst->ioapic.node = NUMA_NO_NODE;
- if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) {
- dst->ioapic.trigger = trigger;
- dst->ioapic.polarity = polarity;
+ if (__acpi_get_override_irq(gsi, &level, &pol_low) >= 0) {
+ dst->ioapic.is_level = level;
+ dst->ioapic.active_low = pol_low;
} else {
/*
* PCI interrupts are always active low level
* triggered.
*/
- dst->ioapic.trigger = IOAPIC_LEVEL;
- dst->ioapic.polarity = IOAPIC_POL_LOW;
+ dst->ioapic.is_level = true;
+ dst->ioapic.active_low = true;
}
}
}
@@ -906,12 +905,12 @@ static int ioapic_alloc_attr_node(struct irq_alloc_info *info)
return (info && info->ioapic.valid) ? info->ioapic.node : NUMA_NO_NODE;
}
-static void mp_register_handler(unsigned int irq, unsigned long trigger)
+static void mp_register_handler(unsigned int irq, bool level)
{
irq_flow_handler_t hdl;
bool fasteoi;
- if (trigger) {
+ if (level) {
irq_set_status_flags(irq, IRQ_LEVEL);
fasteoi = true;
} else {
@@ -933,14 +932,14 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
* pin with real trigger and polarity attributes.
*/
if (irq < nr_legacy_irqs() && data->count == 1) {
- if (info->ioapic.trigger != data->trigger)
- mp_register_handler(irq, info->ioapic.trigger);
- data->entry.trigger = data->trigger = info->ioapic.trigger;
- data->entry.polarity = data->polarity = info->ioapic.polarity;
+ if (info->ioapic.is_level != data->is_level)
+ mp_register_handler(irq, info->ioapic.is_level);
+ data->entry.is_level = data->is_level = info->ioapic.is_level;
+ data->entry.active_low = data->active_low = info->ioapic.active_low;
}
- return data->trigger == info->ioapic.trigger &&
- data->polarity == info->ioapic.polarity;
+ return data->is_level == info->ioapic.is_level &&
+ data->active_low == info->ioapic.active_low;
}
static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
@@ -1219,10 +1218,9 @@ void ioapic_zap_locks(void)
static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
{
- int i;
- char buf[256];
struct IO_APIC_route_entry entry;
- struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry;
+ char buf[256];
+ int i;
printk(KERN_DEBUG "IOAPIC %d:\n", apic);
for (i = 0; i <= nr_entries; i++) {
@@ -1230,20 +1228,21 @@ static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
snprintf(buf, sizeof(buf),
" pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)",
i,
- entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ",
- entry.trigger == IOAPIC_LEVEL ? "level" : "edge ",
- entry.polarity == IOAPIC_POL_LOW ? "low " : "high",
+ entry.masked ? "disabled" : "enabled ",
+ entry.is_level ? "level" : "edge ",
+ entry.active_low ? "low " : "high",
entry.vector, entry.irr, entry.delivery_status);
- if (ir_entry->format)
+ if (entry.ir_format) {
printk(KERN_DEBUG "%s, remapped, I(%04X), Z(%X)\n",
- buf, (ir_entry->index2 << 15) | ir_entry->index,
- ir_entry->zero);
- else
- printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n",
buf,
- entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ?
- "logical " : "physical",
- entry.dest, entry.delivery_mode);
+ (entry.ir_index_15 << 15) | entry.ir_index_0_14,
+ entry.ir_zero);
+ } else {
+ printk(KERN_DEBUG "%s, %s, D(%02X%02X), M(%1d)\n", buf,
+ entry.dest_mode_logical ? "logical " : "physical",
+ entry.virt_destid_8_14, entry.destid_0_7,
+ entry.delivery_mode);
+ }
}
}
@@ -1368,7 +1367,8 @@ void __init enable_IO_APIC(void)
/* If the interrupt line is enabled and in ExtInt mode
* I have found the pin where the i8259 is connected.
*/
- if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
+ if (!entry.masked &&
+ entry.delivery_mode == APIC_DELIVERY_MODE_EXTINT) {
ioapic_i8259.apic = apic;
ioapic_i8259.pin = pin;
goto found_i8259;
@@ -1410,14 +1410,16 @@ void native_restore_boot_irq_mode(void)
*/
if (ioapic_i8259.pin != -1) {
struct IO_APIC_route_entry entry;
+ u32 apic_id = read_apic_id();
memset(&entry, 0, sizeof(entry));
- entry.mask = IOAPIC_UNMASKED;
- entry.trigger = IOAPIC_EDGE;
- entry.polarity = IOAPIC_POL_HIGH;
- entry.dest_mode = IOAPIC_DEST_MODE_PHYSICAL;
- entry.delivery_mode = dest_ExtINT;
- entry.dest = read_apic_id();
+ entry.masked = false;
+ entry.is_level = false;
+ entry.active_low = false;
+ entry.dest_mode_logical = false;
+ entry.delivery_mode = APIC_DELIVERY_MODE_EXTINT;
+ entry.destid_0_7 = apic_id & 0xFF;
+ entry.virt_destid_8_14 = apic_id >> 8;
/*
* Add it to the IO-APIC irq-routing table:
@@ -1618,21 +1620,16 @@ static void __init delay_without_tsc(void)
static int __init timer_irq_works(void)
{
unsigned long t1 = jiffies;
- unsigned long flags;
if (no_timer_check)
return 1;
- local_save_flags(flags);
local_irq_enable();
-
if (boot_cpu_has(X86_FEATURE_TSC))
delay_with_tsc();
else
delay_without_tsc();
- local_irq_restore(flags);
-
/*
* Expect a few ticks at least, to be sure some possible
* glue logic does not lock up after one or two first
@@ -1641,10 +1638,10 @@ static int __init timer_irq_works(void)
* least one tick may be lost due to delays.
*/
- /* jiffies wrap? */
- if (time_after(jiffies, t1 + 4))
- return 1;
- return 0;
+ local_irq_disable();
+
+ /* Did jiffies advance? */
+ return time_after(jiffies, t1 + 4);
}
/*
@@ -1696,13 +1693,13 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data)
raw_spin_lock_irqsave(&ioapic_lock, flags);
for_each_irq_pin(entry, data->irq_2_pin) {
- unsigned int reg;
+ struct IO_APIC_route_entry e;
int pin;
pin = entry->pin;
- reg = io_apic_read(entry->apic, 0x10 + pin*2);
+ e.w1 = io_apic_read(entry->apic, 0x10 + pin*2);
/* Is the remote IRR bit set? */
- if (reg & IO_APIC_REDIR_REMOTE_IRR) {
+ if (e.irr) {
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return true;
}
@@ -1849,21 +1846,62 @@ static void ioapic_ir_ack_level(struct irq_data *irq_data)
eoi_ioapic_pin(data->entry.vector, data);
}
+/*
+ * The I/OAPIC is just a device for generating MSI messages from legacy
+ * interrupt pins. Various fields of the RTE translate into bits of the
+ * resulting MSI which had a historical meaning.
+ *
+ * With interrupt remapping, many of those bits have different meanings
+ * in the underlying MSI, but the way that the I/OAPIC transforms them
+ * from its RTE to the MSI message is the same. This function allows
+ * the parent IRQ domain to compose the MSI message, then takes the
+ * relevant bits to put them in the appropriate places in the RTE in
+ * order to generate that message when the IRQ happens.
+ *
+ * The setup here relies on a preconfigured route entry (is_level,
+ * active_low, masked) because the parent domain is merely composing the
+ * generic message routing information which is used for the MSI.
+ */
+static void ioapic_setup_msg_from_msi(struct irq_data *irq_data,
+ struct IO_APIC_route_entry *entry)
+{
+ struct msi_msg msg;
+
+ /* Let the parent domain compose the MSI message */
+ irq_chip_compose_msi_msg(irq_data, &msg);
+
+ /*
+ * - Real vector
+ * - DMAR/IR: 8bit subhandle (ioapic.pin)
+ * - AMD/IR: 8bit IRTE index
+ */
+ entry->vector = msg.arch_data.vector;
+ /* Delivery mode (for DMAR/IR all 0) */
+ entry->delivery_mode = msg.arch_data.delivery_mode;
+ /* Destination mode or DMAR/IR index bit 15 */
+ entry->dest_mode_logical = msg.arch_addr_lo.dest_mode_logical;
+ /* DMAR/IR: 1, 0 for all other modes */
+ entry->ir_format = msg.arch_addr_lo.dmar_format;
+ /*
+ * - DMAR/IR: index bit 0-14.
+ *
+ * - Virt: If the host supports x2apic without a virtualized IR
+ * unit then bit 0-6 of dmar_index_0_14 are providing bit
+ * 8-14 of the destination id.
+ *
+ * All other modes have bit 0-6 of dmar_index_0_14 cleared and the
+ * topmost 8 bits are destination id bit 0-7 (entry::destid_0_7).
+ */
+ entry->ir_index_0_14 = msg.arch_addr_lo.dmar_index_0_14;
+}
+
static void ioapic_configure_entry(struct irq_data *irqd)
{
struct mp_chip_data *mpd = irqd->chip_data;
- struct irq_cfg *cfg = irqd_cfg(irqd);
struct irq_pin_list *entry;
- /*
- * Only update when the parent is the vector domain, don't touch it
- * if the parent is the remapping domain. Check the installed
- * ioapic chip to verify that.
- */
- if (irqd->chip == &ioapic_chip) {
- mpd->entry.dest = cfg->dest_apicid;
- mpd->entry.vector = cfg->vector;
- }
+ ioapic_setup_msg_from_msi(irqd, &mpd->entry);
+
for_each_irq_pin(entry, mpd->irq_2_pin)
__ioapic_write_entry(entry->apic, entry->pin, mpd->entry);
}
@@ -1919,7 +1957,7 @@ static int ioapic_irq_get_chip_state(struct irq_data *irqd,
* irrelevant because the IO-APIC treats them as fire and
* forget.
*/
- if (rentry.irr && rentry.trigger) {
+ if (rentry.irr && rentry.is_level) {
*state = true;
break;
}
@@ -2027,6 +2065,7 @@ static inline void __init unlock_ExtINT_logic(void)
int apic, pin, i;
struct IO_APIC_route_entry entry0, entry1;
unsigned char save_control, save_freq_select;
+ u32 apic_id;
pin = find_isa_irq_pin(8, mp_INT);
if (pin == -1) {
@@ -2042,14 +2081,16 @@ static inline void __init unlock_ExtINT_logic(void)
entry0 = ioapic_read_entry(apic, pin);
clear_IO_APIC_pin(apic, pin);
+ apic_id = hard_smp_processor_id();
memset(&entry1, 0, sizeof(entry1));
- entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL;
- entry1.mask = IOAPIC_UNMASKED;
- entry1.dest = hard_smp_processor_id();
- entry1.delivery_mode = dest_ExtINT;
- entry1.polarity = entry0.polarity;
- entry1.trigger = IOAPIC_EDGE;
+ entry1.dest_mode_logical = true;
+ entry1.masked = false;
+ entry1.destid_0_7 = apic_id & 0xFF;
+ entry1.virt_destid_8_14 = apic_id >> 8;
+ entry1.delivery_mode = APIC_DELIVERY_MODE_EXTINT;
+ entry1.active_low = entry0.active_low;
+ entry1.is_level = false;
entry1.vector = 0;
ioapic_write_entry(apic, pin, entry1);
@@ -2117,13 +2158,12 @@ static inline void __init check_timer(void)
struct irq_cfg *cfg = irqd_cfg(irq_data);
int node = cpu_to_node(0);
int apic1, pin1, apic2, pin2;
- unsigned long flags;
int no_pin1 = 0;
if (!global_clock_event)
return;
- local_irq_save(flags);
+ local_irq_disable();
/*
* get/set the timer IRQ vector:
@@ -2178,9 +2218,9 @@ static inline void __init check_timer(void)
* so only need to unmask if it is level-trigger
* do we really have level trigger timer?
*/
- int idx;
- idx = find_irq_entry(apic1, pin1, mp_INT);
- if (idx != -1 && irq_trigger(idx))
+ int idx = find_irq_entry(apic1, pin1, mp_INT);
+
+ if (idx != -1 && irq_is_level(idx))
unmask_ioapic_irq(irq_get_irq_data(0));
}
irq_domain_deactivate_irq(irq_data);
@@ -2191,7 +2231,6 @@ static inline void __init check_timer(void)
goto out;
}
panic_if_irq_remap("timer doesn't work through Interrupt-remapped IO-APIC");
- local_irq_disable();
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
@@ -2215,7 +2254,6 @@ static inline void __init check_timer(void)
/*
* Cleanup, just in case ...
*/
- local_irq_disable();
legacy_pic->mask(0);
clear_IO_APIC_pin(apic2, pin2);
apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
@@ -2232,7 +2270,6 @@ static inline void __init check_timer(void)
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
- local_irq_disable();
legacy_pic->mask(0);
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
@@ -2251,7 +2288,6 @@ static inline void __init check_timer(void)
apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
goto out;
}
- local_irq_disable();
apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
if (apic_is_x2apic_enabled())
apic_printk(APIC_QUIET, KERN_INFO
@@ -2260,7 +2296,7 @@ static inline void __init check_timer(void)
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
"report. Then try booting with the 'noapic' option.\n");
out:
- local_irq_restore(flags);
+ local_irq_enable();
}
/*
@@ -2284,36 +2320,37 @@ out:
static int mp_irqdomain_create(int ioapic)
{
- struct irq_alloc_info info;
struct irq_domain *parent;
int hwirqs = mp_ioapic_pin_count(ioapic);
struct ioapic *ip = &ioapics[ioapic];
struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);
struct fwnode_handle *fn;
- char *name = "IO-APIC";
+ struct irq_fwspec fwspec;
if (cfg->type == IOAPIC_DOMAIN_INVALID)
return 0;
- init_irq_alloc_info(&info, NULL);
- info.type = X86_IRQ_ALLOC_TYPE_IOAPIC_GET_PARENT;
- info.devid = mpc_ioapic_id(ioapic);
- parent = irq_remapping_get_irq_domain(&info);
- if (!parent)
- parent = x86_vector_domain;
- else
- name = "IO-APIC-IR";
-
/* Handle device tree enumerated APICs proper */
if (cfg->dev) {
fn = of_node_to_fwnode(cfg->dev);
} else {
- fn = irq_domain_alloc_named_id_fwnode(name, ioapic);
+ fn = irq_domain_alloc_named_id_fwnode("IO-APIC", mpc_ioapic_id(ioapic));
if (!fn)
return -ENOMEM;
}
+ fwspec.fwnode = fn;
+ fwspec.param_count = 1;
+ fwspec.param[0] = mpc_ioapic_id(ioapic);
+
+ parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY);
+ if (!parent) {
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
+ return -ENODEV;
+ }
+
ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops,
(void *)(long)ioapic);
@@ -2587,30 +2624,6 @@ static int io_apic_get_version(int ioapic)
return reg_01.bits.version;
}
-int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
-{
- int ioapic, pin, idx;
-
- if (skip_ioapic_setup)
- return -1;
-
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return -1;
-
- pin = mp_find_ioapic_pin(ioapic, gsi);
- if (pin < 0)
- return -1;
-
- idx = find_irq_entry(ioapic, pin, mp_INT);
- if (idx < 0)
- return -1;
-
- *trigger = irq_trigger(idx);
- *polarity = irq_polarity(idx);
- return 0;
-}
-
/*
* This function updates target affinity of IOAPIC interrupts to include
* the CPUs which came online during SMP bringup.
@@ -2934,44 +2947,49 @@ static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data,
struct irq_alloc_info *info)
{
if (info && info->ioapic.valid) {
- data->trigger = info->ioapic.trigger;
- data->polarity = info->ioapic.polarity;
- } else if (acpi_get_override_irq(gsi, &data->trigger,
- &data->polarity) < 0) {
+ data->is_level = info->ioapic.is_level;
+ data->active_low = info->ioapic.active_low;
+ } else if (__acpi_get_override_irq(gsi, &data->is_level,
+ &data->active_low) < 0) {
/* PCI interrupts are always active low level triggered. */
- data->trigger = IOAPIC_LEVEL;
- data->polarity = IOAPIC_POL_LOW;
+ data->is_level = true;
+ data->active_low = true;
}
}
-static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data,
- struct IO_APIC_route_entry *entry)
+/*
+ * Configure the I/O-APIC specific fields in the routing entry.
+ *
+ * This is important to setup the I/O-APIC specific bits (is_level,
+ * active_low, masked) because the underlying parent domain will only
+ * provide the routing information and is oblivious of the I/O-APIC
+ * specific bits.
+ *
+ * The entry is just preconfigured at this point and not written into the
+ * RTE. This happens later during activation which will fill in the actual
+ * routing information.
+ */
+static void mp_preconfigure_entry(struct mp_chip_data *data)
{
+ struct IO_APIC_route_entry *entry = &data->entry;
+
memset(entry, 0, sizeof(*entry));
- entry->delivery_mode = apic->irq_delivery_mode;
- entry->dest_mode = apic->irq_dest_mode;
- entry->dest = cfg->dest_apicid;
- entry->vector = cfg->vector;
- entry->trigger = data->trigger;
- entry->polarity = data->polarity;
+ entry->is_level = data->is_level;
+ entry->active_low = data->active_low;
/*
* Mask level triggered irqs. Edge triggered irqs are masked
* by the irq core code in case they fire.
*/
- if (data->trigger == IOAPIC_LEVEL)
- entry->mask = IOAPIC_MASKED;
- else
- entry->mask = IOAPIC_UNMASKED;
+ entry->masked = data->is_level;
}
int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
unsigned int nr_irqs, void *arg)
{
- int ret, ioapic, pin;
- struct irq_cfg *cfg;
- struct irq_data *irq_data;
- struct mp_chip_data *data;
struct irq_alloc_info *info = arg;
+ struct mp_chip_data *data;
+ struct irq_data *irq_data;
+ int ret, ioapic, pin;
unsigned long flags;
if (!info || nr_irqs > 1)
@@ -2989,7 +3007,6 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
if (!data)
return -ENOMEM;
- info->ioapic.entry = &data->entry;
ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
if (ret < 0) {
kfree(data);
@@ -3003,22 +3020,20 @@ int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
irq_data->chip_data = data;
mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info);
- cfg = irqd_cfg(irq_data);
add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin);
+ mp_preconfigure_entry(data);
+ mp_register_handler(virq, data->is_level);
+
local_irq_save(flags);
- if (info->ioapic.entry)
- mp_setup_entry(cfg, data, info->ioapic.entry);
- mp_register_handler(virq, data->trigger);
if (virq < nr_legacy_irqs())
legacy_pic->mask(virq);
local_irq_restore(flags);
apic_printk(APIC_VERBOSE, KERN_DEBUG
- "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n",
- ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector,
- virq, data->trigger, data->polarity, cfg->dest_apicid);
-
+ "IOAPIC[%d]: Preconfigured routing entry (%d-%d -> IRQ %d Level:%i ActiveLow:%i)\n",
+ ioapic, mpc_ioapic_id(ioapic), pin, virq,
+ data->is_level, data->active_low);
return 0;
}
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 387154e39e08..d1fb874fbe64 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -260,7 +260,7 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
for_each_cpu(query_cpu, mask)
__default_send_IPI_dest_field(
early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, apic->dest_logical);
+ vector, APIC_DEST_LOGICAL);
local_irq_restore(flags);
}
@@ -279,7 +279,7 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
continue;
__default_send_IPI_dest_field(
early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
- vector, apic->dest_logical);
+ vector, APIC_DEST_LOGICAL);
}
local_irq_restore(flags);
}
@@ -297,7 +297,7 @@ void default_send_IPI_mask_logical(const struct cpumask *cpumask, int vector)
local_irq_save(flags);
WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
- __default_send_IPI_dest_field(mask, vector, apic->dest_logical);
+ __default_send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL);
local_irq_restore(flags);
}
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 6313f0a05db7..44ebe25e7703 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -15,7 +15,6 @@
#include <linux/hpet.h>
#include <linux/msi.h>
#include <asm/irqdomain.h>
-#include <asm/msidef.h>
#include <asm/hpet.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
@@ -23,38 +22,11 @@
struct irq_domain *x86_pci_msi_default_domain __ro_after_init;
-static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg)
-{
- msg->address_hi = MSI_ADDR_BASE_HI;
-
- if (x2apic_enabled())
- msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid);
-
- msg->address_lo =
- MSI_ADDR_BASE_LO |
- ((apic->irq_dest_mode == 0) ?
- MSI_ADDR_DEST_MODE_PHYSICAL :
- MSI_ADDR_DEST_MODE_LOGICAL) |
- MSI_ADDR_REDIRECTION_CPU |
- MSI_ADDR_DEST_ID(cfg->dest_apicid);
-
- msg->data =
- MSI_DATA_TRIGGER_EDGE |
- MSI_DATA_LEVEL_ASSERT |
- MSI_DATA_DELIVERY_FIXED |
- MSI_DATA_VECTOR(cfg->vector);
-}
-
-void x86_vector_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
-{
- __irq_msi_compose_msg(irqd_cfg(data), msg);
-}
-
static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg)
{
struct msi_msg msg[2] = { [1] = { }, };
- __irq_msi_compose_msg(cfg, msg);
+ __irq_msi_compose_msg(cfg, msg, false);
irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg);
}
@@ -276,6 +248,17 @@ struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent,
#endif
#ifdef CONFIG_DMAR_TABLE
+/*
+ * The Intel IOMMU (ab)uses the high bits of the MSI address to contain the
+ * high bits of the destination APIC ID. This can't be done in the general
+ * case for MSIs as it would be targeting real memory above 4GiB not the
+ * APIC.
+ */
+static void dmar_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ __irq_msi_compose_msg(irqd_cfg(data), msg, true);
+}
+
static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
{
dmar_msi_write(data->irq, msg);
@@ -288,6 +271,7 @@ static struct irq_chip dmar_msi_controller = {
.irq_ack = irq_chip_ack_parent,
.irq_set_affinity = msi_domain_set_affinity,
.irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_compose_msi_msg = dmar_msi_compose_msg,
.irq_write_msi_msg = dmar_msi_write_msg,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
@@ -356,114 +340,3 @@ void dmar_free_hwirq(int irq)
irq_domain_free_irqs(irq, 1);
}
#endif
-
-/*
- * MSI message composition
- */
-#ifdef CONFIG_HPET_TIMER
-static inline int hpet_dev_id(struct irq_domain *domain)
-{
- struct msi_domain_info *info = msi_get_domain_info(domain);
-
- return (int)(long)info->data;
-}
-
-static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
-{
- hpet_msi_write(irq_data_get_irq_handler_data(data), msg);
-}
-
-static struct irq_chip hpet_msi_controller __ro_after_init = {
- .name = "HPET-MSI",
- .irq_unmask = hpet_msi_unmask,
- .irq_mask = hpet_msi_mask,
- .irq_ack = irq_chip_ack_parent,
- .irq_set_affinity = msi_domain_set_affinity,
- .irq_retrigger = irq_chip_retrigger_hierarchy,
- .irq_write_msi_msg = hpet_msi_write_msg,
- .flags = IRQCHIP_SKIP_SET_WAKE,
-};
-
-static int hpet_msi_init(struct irq_domain *domain,
- struct msi_domain_info *info, unsigned int virq,
- irq_hw_number_t hwirq, msi_alloc_info_t *arg)
-{
- irq_set_status_flags(virq, IRQ_MOVE_PCNTXT);
- irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL,
- handle_edge_irq, arg->data, "edge");
-
- return 0;
-}
-
-static void hpet_msi_free(struct irq_domain *domain,
- struct msi_domain_info *info, unsigned int virq)
-{
- irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT);
-}
-
-static struct msi_domain_ops hpet_msi_domain_ops = {
- .msi_init = hpet_msi_init,
- .msi_free = hpet_msi_free,
-};
-
-static struct msi_domain_info hpet_msi_domain_info = {
- .ops = &hpet_msi_domain_ops,
- .chip = &hpet_msi_controller,
- .flags = MSI_FLAG_USE_DEF_DOM_OPS,
-};
-
-struct irq_domain *hpet_create_irq_domain(int hpet_id)
-{
- struct msi_domain_info *domain_info;
- struct irq_domain *parent, *d;
- struct irq_alloc_info info;
- struct fwnode_handle *fn;
-
- if (x86_vector_domain == NULL)
- return NULL;
-
- domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL);
- if (!domain_info)
- return NULL;
-
- *domain_info = hpet_msi_domain_info;
- domain_info->data = (void *)(long)hpet_id;
-
- init_irq_alloc_info(&info, NULL);
- info.type = X86_IRQ_ALLOC_TYPE_HPET_GET_PARENT;
- info.devid = hpet_id;
- parent = irq_remapping_get_irq_domain(&info);
- if (parent == NULL)
- parent = x86_vector_domain;
- else
- hpet_msi_controller.name = "IR-HPET-MSI";
-
- fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name,
- hpet_id);
- if (!fn) {
- kfree(domain_info);
- return NULL;
- }
-
- d = msi_create_irq_domain(fn, domain_info, parent);
- if (!d) {
- irq_domain_free_fwnode(fn);
- kfree(domain_info);
- }
- return d;
-}
-
-int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc,
- int dev_num)
-{
- struct irq_alloc_info info;
-
- init_irq_alloc_info(&info, NULL);
- info.type = X86_IRQ_ALLOC_TYPE_HPET;
- info.data = hc;
- info.devid = hpet_dev_id(domain);
- info.hwirq = dev_num;
-
- return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info);
-}
-#endif
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 67b6f7c049ec..a61f642b1b90 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -69,16 +69,13 @@ static struct apic apic_default __ro_after_init = {
.apic_id_valid = default_apic_id_valid,
.apic_id_registered = default_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- /* logical delivery broadcast to all CPUs: */
- .irq_dest_mode = 1,
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = true,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = default_check_apicid_used,
+ .check_apicid_used = default_check_apicid_used,
.init_apic_ldr = default_init_apic_ldr,
-
.ioapic_phys_id_map = default_ioapic_phys_id_map,
.setup_apic_routing = setup_apic_flat_routing,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 758bbf25ef74..3c9c7492252f 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -640,7 +640,50 @@ static void x86_vector_debug_show(struct seq_file *m, struct irq_domain *d,
}
#endif
+int x86_fwspec_is_ioapic(struct irq_fwspec *fwspec)
+{
+ if (fwspec->param_count != 1)
+ return 0;
+
+ if (is_fwnode_irqchip(fwspec->fwnode)) {
+ const char *fwname = fwnode_get_name(fwspec->fwnode);
+ return fwname && !strncmp(fwname, "IO-APIC-", 8) &&
+ simple_strtol(fwname+8, NULL, 10) == fwspec->param[0];
+ }
+ return to_of_node(fwspec->fwnode) &&
+ of_device_is_compatible(to_of_node(fwspec->fwnode),
+ "intel,ce4100-ioapic");
+}
+
+int x86_fwspec_is_hpet(struct irq_fwspec *fwspec)
+{
+ if (fwspec->param_count != 1)
+ return 0;
+
+ if (is_fwnode_irqchip(fwspec->fwnode)) {
+ const char *fwname = fwnode_get_name(fwspec->fwnode);
+ return fwname && !strncmp(fwname, "HPET-MSI-", 9) &&
+ simple_strtol(fwname+9, NULL, 10) == fwspec->param[0];
+ }
+ return 0;
+}
+
+static int x86_vector_select(struct irq_domain *d, struct irq_fwspec *fwspec,
+ enum irq_domain_bus_token bus_token)
+{
+ /*
+ * HPET and I/OAPIC cannot be parented in the vector domain
+ * if IRQ remapping is enabled. APIC IDs above 15 bits are
+ * only permitted if IRQ remapping is enabled, so check that.
+ */
+ if (apic->apic_id_valid(32768))
+ return 0;
+
+ return x86_fwspec_is_ioapic(fwspec) || x86_fwspec_is_hpet(fwspec);
+}
+
static const struct irq_domain_ops x86_vector_domain_ops = {
+ .select = x86_vector_select,
.alloc = x86_vector_alloc_irqs,
.free = x86_vector_free_irqs,
.activate = x86_vector_activate,
@@ -822,6 +865,12 @@ void apic_ack_edge(struct irq_data *irqd)
apic_ack_irq(irqd);
}
+static void x86_vector_msi_compose_msg(struct irq_data *data,
+ struct msi_msg *msg)
+{
+ __irq_msi_compose_msg(irqd_cfg(data), msg, false);
+}
+
static struct irq_chip lapic_controller = {
.name = "APIC",
.irq_ack = apic_ack_edge,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index b0889c48a2ac..df6adc5674c9 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -61,7 +61,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
if (!dest)
continue;
- __x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
+ __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
/* Remove cluster CPUs from tmpmask */
cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask);
}
@@ -184,15 +184,13 @@ static struct apic apic_x2apic_cluster __ro_after_init = {
.apic_id_valid = x2apic_apic_id_valid,
.apic_id_registered = x2apic_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 1, /* logical */
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = true,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = NULL,
+ .check_apicid_used = NULL,
.init_apic_ldr = init_x2apic_ldr,
-
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index bc9693841353..0e4e81971567 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -8,6 +8,12 @@
int x2apic_phys;
static struct apic apic_x2apic_phys;
+static u32 x2apic_max_apicid __ro_after_init;
+
+void __init x2apic_set_max_apicid(u32 apicid)
+{
+ x2apic_max_apicid = apicid;
+}
static int __init set_x2apic_phys_mode(char *arg)
{
@@ -98,6 +104,9 @@ static int x2apic_phys_probe(void)
/* Common x2apic functions, also used by x2apic_cluster */
int x2apic_apic_id_valid(u32 apicid)
{
+ if (x2apic_max_apicid && apicid > x2apic_max_apicid)
+ return 0;
+
return 1;
}
@@ -148,15 +157,13 @@ static struct apic apic_x2apic_phys __ro_after_init = {
.apic_id_valid = x2apic_apic_id_valid,
.apic_id_registered = x2apic_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* physical */
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = false,
.disable_esr = 0,
- .dest_logical = 0,
- .check_apicid_used = NULL,
+ .check_apicid_used = NULL,
.init_apic_ldr = init_x2apic_ldr,
-
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 235f5cde06fc..52bc217ca8c3 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -502,6 +502,18 @@ enum uv_system_type get_uv_system_type(void)
return uv_system_type;
}
+int uv_get_hubless_system(void)
+{
+ return uv_hubless_system;
+}
+EXPORT_SYMBOL_GPL(uv_get_hubless_system);
+
+ssize_t uv_get_archtype(char *buf, int len)
+{
+ return scnprintf(buf, len, "%s/%s", uv_archtype, oem_table_id);
+}
+EXPORT_SYMBOL_GPL(uv_get_archtype);
+
int is_uv_system(void)
{
return uv_system_type != UV_NONE;
@@ -716,9 +728,9 @@ static void uv_send_IPI_one(int cpu, int vector)
unsigned long dmode, val;
if (vector == NMI_VECTOR)
- dmode = dest_NMI;
+ dmode = APIC_DELIVERY_MODE_NMI;
else
- dmode = dest_Fixed;
+ dmode = APIC_DELIVERY_MODE_FIXED;
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(apicid << UVH_IPI_INT_APIC_ID_SHFT) |
@@ -820,15 +832,13 @@ static struct apic apic_x2apic_uv_x __ro_after_init = {
.apic_id_valid = uv_apic_id_valid,
.apic_id_registered = uv_apic_id_registered,
- .irq_delivery_mode = dest_Fixed,
- .irq_dest_mode = 0, /* Physical */
+ .delivery_mode = APIC_DELIVERY_MODE_FIXED,
+ .dest_mode_logical = false,
.disable_esr = 0,
- .dest_logical = APIC_DEST_LOGICAL,
- .check_apicid_used = NULL,
+ .check_apicid_used = NULL,
.init_apic_ldr = uv_init_apic_ldr,
-
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
@@ -1603,21 +1613,30 @@ static void check_efi_reboot(void)
reboot_type = BOOT_ACPI;
}
-/* Setup user proc fs files */
+/*
+ * User proc fs file handling now deprecated.
+ * Recommend using /sys/firmware/sgi_uv/... instead.
+ */
static int __maybe_unused proc_hubbed_show(struct seq_file *file, void *data)
{
+ pr_notice_once("%s: using deprecated /proc/sgi_uv/hubbed, use /sys/firmware/sgi_uv/hub_type\n",
+ current->comm);
seq_printf(file, "0x%x\n", uv_hubbed_system);
return 0;
}
static int __maybe_unused proc_hubless_show(struct seq_file *file, void *data)
{
+ pr_notice_once("%s: using deprecated /proc/sgi_uv/hubless, use /sys/firmware/sgi_uv/hubless\n",
+ current->comm);
seq_printf(file, "0x%x\n", uv_hubless_system);
return 0;
}
static int __maybe_unused proc_archtype_show(struct seq_file *file, void *data)
{
+ pr_notice_once("%s: using deprecated /proc/sgi_uv/archtype, use /sys/firmware/sgi_uv/archtype\n",
+ current->comm);
seq_printf(file, "%s/%s\n", uv_archtype, oem_table_id);
return 0;
}
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 70b7154f4bdd..60b9f42ce3c1 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -66,7 +66,6 @@ static void __used common(void)
OFFSET(PV_IRQ_irq_disable, paravirt_patch_template, irq.irq_disable);
OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable);
OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret);
- OFFSET(PV_MMU_read_cr2, paravirt_patch_template, mmu.read_cr2);
#endif
#ifdef CONFIG_XEN
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 93792b457b81..637b499450d1 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_X86_MCE) += mce/
obj-$(CONFIG_MTRR) += mtrr/
obj-$(CONFIG_MICROCODE) += microcode/
obj-$(CONFIG_X86_CPU_RESCTRL) += resctrl/
+obj-$(CONFIG_X86_SGX) += sgx/
obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 6062ce586b95..f8ca66f3d861 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -23,7 +23,6 @@
#ifdef CONFIG_X86_64
# include <asm/mmconfig.h>
-# include <asm/set_memory.h>
#endif
#include "cpu.h"
@@ -330,7 +329,6 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
*/
static void amd_get_topology(struct cpuinfo_x86 *c)
{
- u8 node_id;
int cpu = smp_processor_id();
/* get information required for multi-node processors */
@@ -340,7 +338,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
- node_id = ecx & 0xff;
+ c->cpu_die_id = ecx & 0xff;
if (c->x86 == 0x15)
c->cu_id = ebx & 0xff;
@@ -360,15 +358,15 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
if (!err)
c->x86_coreid_bits = get_count_order(c->x86_max_cores);
- cacheinfo_amd_init_llc_id(c, cpu, node_id);
+ cacheinfo_amd_init_llc_id(c, cpu);
} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
- node_id = value & 7;
+ c->cpu_die_id = value & 7;
- per_cpu(cpu_llc_id, cpu) = node_id;
+ per_cpu(cpu_llc_id, cpu) = c->cpu_die_id;
} else
return;
@@ -393,7 +391,7 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
/* Convert the initial APIC ID into the socket ID */
c->phys_proc_id = c->initial_apicid >> bits;
/* use socket ID also for last level cache */
- per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
+ per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id;
}
static void amd_detect_ppin(struct cpuinfo_x86 *c)
@@ -425,12 +423,6 @@ clear_ppin:
clear_cpu_cap(c, X86_FEATURE_AMD_PPIN);
}
-u16 amd_get_nb_id(int cpu)
-{
- return per_cpu(cpu_llc_id, cpu);
-}
-EXPORT_SYMBOL_GPL(amd_get_nb_id);
-
u32 amd_get_nodes_per_socket(void)
{
return nodes_per_socket;
@@ -516,26 +508,6 @@ static void early_init_amd_mc(struct cpuinfo_x86 *c)
static void bsp_init_amd(struct cpuinfo_x86 *c)
{
-
-#ifdef CONFIG_X86_64
- if (c->x86 >= 0xf) {
- unsigned long long tseg;
-
- /*
- * Split up direct mapping around the TSEG SMM area.
- * Don't do it for gbpages because there seems very little
- * benefit in doing so.
- */
- if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
- unsigned long pfn = tseg >> PAGE_SHIFT;
-
- pr_debug("tseg: %010llx\n", tseg);
- if (pfn_range_is_mapped(pfn, pfn + 1))
- set_memory_4k((unsigned long)__va(tseg), 1);
- }
- }
-#endif
-
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
if (c->x86 > 0x10 ||
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index e2f319dc992d..22911deacb6e 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -14,11 +14,13 @@
#include <linux/cpufreq.h>
#include <linux/smp.h>
#include <linux/sched/isolation.h>
+#include <linux/rcupdate.h>
#include "cpu.h"
struct aperfmperf_sample {
unsigned int khz;
+ atomic_t scfpending;
ktime_t time;
u64 aperf;
u64 mperf;
@@ -62,17 +64,20 @@ static void aperfmperf_snapshot_khz(void *dummy)
s->aperf = aperf;
s->mperf = mperf;
s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
+ atomic_set_release(&s->scfpending, 0);
}
static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
{
s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu));
+ struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);
/* Don't bother re-computing within the cache threshold time. */
if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
return true;
- smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait);
+ if (!atomic_xchg(&s->scfpending, 1) || wait)
+ smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait);
/* Return false if the previous iteration was too long ago. */
return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
@@ -89,6 +94,9 @@ unsigned int aperfmperf_get_khz(int cpu)
if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
return 0;
+ if (rcu_is_idle_cpu(cpu))
+ return 0; /* Idle CPUs are completely uninteresting. */
+
aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
return per_cpu(samples.khz, cpu);
}
@@ -108,6 +116,8 @@ void arch_freq_prepare_all(void)
for_each_online_cpu(cpu) {
if (!housekeeping_cpu(cpu, HK_FLAG_MISC))
continue;
+ if (rcu_is_idle_cpu(cpu))
+ continue; /* Idle CPUs are completely uninteresting. */
if (!aperfmperf_snapshot_cpu(cpu, now, false))
wait = true;
}
@@ -118,6 +128,8 @@ void arch_freq_prepare_all(void)
unsigned int arch_freq_get_on_cpu(int cpu)
{
+ struct aperfmperf_sample *s = per_cpu_ptr(&samples, cpu);
+
if (!cpu_khz)
return 0;
@@ -131,6 +143,8 @@ unsigned int arch_freq_get_on_cpu(int cpu)
return per_cpu(samples.khz, cpu);
msleep(APERFMPERF_REFRESH_DELAY_MS);
+ atomic_set(&s->scfpending, 1);
+ smp_mb(); /* ->scfpending before smp_call_function_single(). */
smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
return per_cpu(samples.khz, cpu);
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 57074cf3ad7c..3ca9be482a9e 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -580,7 +580,7 @@ static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
if (index < 3)
return;
- node = amd_get_nb_id(smp_processor_id());
+ node = topology_die_id(smp_processor_id());
this_leaf->nb = node_to_amd_nb(node);
if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
amd_calc_l3_indices(this_leaf->nb);
@@ -646,7 +646,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
return i;
}
-void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
+void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu)
{
/*
* We may have multiple LLCs if L3 caches exist, so check if we
@@ -657,7 +657,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
if (c->x86 < 0x17) {
/* LLC is at the node level. */
- per_cpu(cpu_llc_id, cpu) = node_id;
+ per_cpu(cpu_llc_id, cpu) = c->cpu_die_id;
} else if (c->x86 == 0x17 && c->x86_model <= 0x1F) {
/*
* LLC is at the core complex level.
@@ -684,7 +684,7 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
}
}
-void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id)
+void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu)
{
/*
* We may have multiple LLCs if L3 caches exist, so check if we
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index d502241995a3..42af31b64c2c 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -69,6 +69,7 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_CQM_MBM_TOTAL, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_CQM_MBM_LOCAL, X86_FEATURE_CQM_LLC },
{ X86_FEATURE_AVX512_BF16, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW },
{ X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES },
{ X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA },
{}
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 29a3bedabd06..3b1b01f2b248 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -93,16 +93,41 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
}
#endif /* CONFIG_X86_VMX_FEATURE_NAMES */
+static void clear_sgx_caps(void)
+{
+ setup_clear_cpu_cap(X86_FEATURE_SGX);
+ setup_clear_cpu_cap(X86_FEATURE_SGX_LC);
+}
+
+static int __init nosgx(char *str)
+{
+ clear_sgx_caps();
+
+ return 0;
+}
+
+early_param("nosgx", nosgx);
+
void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
{
bool tboot = tboot_enabled();
+ bool enable_sgx;
u64 msr;
if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
clear_cpu_cap(c, X86_FEATURE_VMX);
+ clear_sgx_caps();
return;
}
+ /*
+ * Enable SGX if and only if the kernel supports SGX and Launch Control
+ * is supported, i.e. disable SGX if the LE hash MSRs can't be written.
+ */
+ enable_sgx = cpu_has(c, X86_FEATURE_SGX) &&
+ cpu_has(c, X86_FEATURE_SGX_LC) &&
+ IS_ENABLED(CONFIG_X86_SGX);
+
if (msr & FEAT_CTL_LOCKED)
goto update_caps;
@@ -124,13 +149,16 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
}
+ if (enable_sgx)
+ msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED;
+
wrmsrl(MSR_IA32_FEAT_CTL, msr);
update_caps:
set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL);
if (!cpu_has(c, X86_FEATURE_VMX))
- return;
+ goto update_sgx;
if ( (tboot && !(msr & FEAT_CTL_VMX_ENABLED_INSIDE_SMX)) ||
(!tboot && !(msr & FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX))) {
@@ -143,4 +171,12 @@ update_caps:
init_vmx_capabilities(c);
#endif
}
+
+update_sgx:
+ if (!(msr & FEAT_CTL_SGX_ENABLED) ||
+ !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) {
+ if (enable_sgx)
+ pr_err_once("SGX disabled by BIOS\n");
+ clear_sgx_caps();
+ }
}
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index ac6c30e5801d..ae59115d18f9 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -14,9 +14,6 @@
#include <asm/cacheinfo.h>
#include <asm/spec-ctrl.h>
#include <asm/delay.h>
-#ifdef CONFIG_X86_64
-# include <asm/set_memory.h>
-#endif
#include "cpu.h"
@@ -65,7 +62,6 @@ static void hygon_get_topology_early(struct cpuinfo_x86 *c)
*/
static void hygon_get_topology(struct cpuinfo_x86 *c)
{
- u8 node_id;
int cpu = smp_processor_id();
/* get information required for multi-node processors */
@@ -75,7 +71,7 @@ static void hygon_get_topology(struct cpuinfo_x86 *c)
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
- node_id = ecx & 0xff;
+ c->cpu_die_id = ecx & 0xff;
c->cpu_core_id = ebx & 0xff;
@@ -93,14 +89,14 @@ static void hygon_get_topology(struct cpuinfo_x86 *c)
/* Socket ID is ApicId[6] for these processors. */
c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT;
- cacheinfo_hygon_init_llc_id(c, cpu, node_id);
+ cacheinfo_hygon_init_llc_id(c, cpu);
} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
- node_id = value & 7;
+ c->cpu_die_id = value & 7;
- per_cpu(cpu_llc_id, cpu) = node_id;
+ per_cpu(cpu_llc_id, cpu) = c->cpu_die_id;
} else
return;
@@ -123,7 +119,7 @@ static void hygon_detect_cmp(struct cpuinfo_x86 *c)
/* Convert the initial APIC ID into the socket ID */
c->phys_proc_id = c->initial_apicid >> bits;
/* use socket ID also for last level cache */
- per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
+ per_cpu(cpu_llc_id, cpu) = c->cpu_die_id = c->phys_proc_id;
}
static void srat_detect_node(struct cpuinfo_x86 *c)
@@ -204,23 +200,6 @@ static void early_init_hygon_mc(struct cpuinfo_x86 *c)
static void bsp_init_hygon(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_64
- unsigned long long tseg;
-
- /*
- * Split up direct mapping around the TSEG SMM area.
- * Don't do it for gbpages because there seems very little
- * benefit in doing so.
- */
- if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
- unsigned long pfn = tseg >> PAGE_SHIFT;
-
- pr_debug("tseg: %010llx\n", tseg);
- if (pfn_range_is_mapped(pfn, pfn + 1))
- set_memory_4k((unsigned long)__va(tseg), 1);
- }
-#endif
-
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
u64 val;
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 0c6b02dd744c..e486f96b3cb3 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -1341,7 +1341,7 @@ static int threshold_create_bank(struct threshold_bank **bp, unsigned int cpu,
return -ENODEV;
if (is_shared_bank(bank)) {
- nb = node_to_amd_nb(amd_get_nb_id(cpu));
+ nb = node_to_amd_nb(topology_die_id(cpu));
/* threshold descriptor already initialized on this node? */
if (nb && nb->bank4) {
@@ -1445,7 +1445,7 @@ static void threshold_remove_bank(struct threshold_bank *bank)
* The last CPU on this node using the shared bank is going
* away, remove that bank now.
*/
- nb = node_to_amd_nb(amd_get_nb_id(smp_processor_id()));
+ nb = node_to_amd_nb(topology_die_id(smp_processor_id()));
nb->bank4 = NULL;
}
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index af8d37962586..b58b85380ddb 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -51,6 +51,67 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
}
EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
+int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
+{
+ const u64 *i_mce = ((const u64 *) (ctx_info + 1));
+ unsigned int cpu;
+ struct mce m;
+
+ if (!boot_cpu_has(X86_FEATURE_SMCA))
+ return -EINVAL;
+
+ /*
+ * The starting address of the register array extracted from BERT must
+ * match with the first expected register in the register layout of
+ * SMCA address space. This address corresponds to banks's MCA_STATUS
+ * register.
+ *
+ * Match any MCi_STATUS register by turning off bank numbers.
+ */
+ if ((ctx_info->msr_addr & MSR_AMD64_SMCA_MC0_STATUS) !=
+ MSR_AMD64_SMCA_MC0_STATUS)
+ return -EINVAL;
+
+ /*
+ * The register array size must be large enough to include all the
+ * SMCA registers which need to be extracted.
+ *
+ * The number of registers in the register array is determined by
+ * Register Array Size/8 as defined in UEFI spec v2.8, sec N.2.4.2.2.
+ * The register layout is fixed and currently the raw data in the
+ * register array includes 6 SMCA registers which the kernel can
+ * extract.
+ */
+ if (ctx_info->reg_arr_size < 48)
+ return -EINVAL;
+
+ mce_setup(&m);
+
+ m.extcpu = -1;
+ m.socketid = -1;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu_data(cpu).initial_apicid == lapic_id) {
+ m.extcpu = cpu;
+ m.socketid = cpu_data(m.extcpu).phys_proc_id;
+ break;
+ }
+ }
+
+ m.apicid = lapic_id;
+ m.bank = (ctx_info->msr_addr >> 4) & 0xFF;
+ m.status = *i_mce;
+ m.addr = *(i_mce + 1);
+ m.misc = *(i_mce + 2);
+ /* Skipping MCA_CONFIG */
+ m.ipid = *(i_mce + 4);
+ m.synd = *(i_mce + 5);
+
+ mce_log(&m);
+
+ return 0;
+}
+
#define CPER_CREATOR_MCE \
GUID_INIT(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
0x64, 0x90, 0xb8, 0x9d)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 32b7099e3511..13d3f1cbda17 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -162,7 +162,8 @@ EXPORT_SYMBOL_GPL(mce_log);
void mce_register_decode_chain(struct notifier_block *nb)
{
- if (WARN_ON(nb->priority > MCE_PRIO_MCELOG && nb->priority < MCE_PRIO_EDAC))
+ if (WARN_ON(nb->priority < MCE_PRIO_LOWEST ||
+ nb->priority > MCE_PRIO_HIGHEST))
return;
blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
@@ -1265,14 +1266,14 @@ static void kill_me_maybe(struct callback_head *cb)
}
}
-static void queue_task_work(struct mce *m, int kill_it)
+static void queue_task_work(struct mce *m, int kill_current_task)
{
current->mce_addr = m->addr;
current->mce_kflags = m->kflags;
current->mce_ripv = !!(m->mcgstatus & MCG_STATUS_RIPV);
current->mce_whole_page = whole_page(m);
- if (kill_it)
+ if (kill_current_task)
current->mce_kill_me.func = kill_me_now;
else
current->mce_kill_me.func = kill_me_maybe;
@@ -1320,10 +1321,10 @@ noinstr void do_machine_check(struct pt_regs *regs)
int no_way_out = 0;
/*
- * If kill_it gets set, there might be a way to recover from this
+ * If kill_current_task is not set, there might be a way to recover from this
* error.
*/
- int kill_it = 0;
+ int kill_current_task = 0;
/*
* MCEs are always local on AMD. Same is determined by MCG_STATUS_LMCES
@@ -1350,8 +1351,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
* severity is MCE_AR_SEVERITY we have other options.
*/
if (!(m.mcgstatus & MCG_STATUS_RIPV))
- kill_it = 1;
-
+ kill_current_task = (cfg->tolerant == 3) ? 0 : 1;
/*
* Check if this MCE is signaled to only this logical processor,
* on Intel, Zhaoxin only.
@@ -1368,7 +1368,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
* to see it will clear it.
*/
if (lmce) {
- if (no_way_out)
+ if (no_way_out && cfg->tolerant < 3)
mce_panic("Fatal local machine check", &m, msg);
} else {
order = mce_start(&no_way_out);
@@ -1387,6 +1387,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
if (mce_end(order) < 0) {
if (!no_way_out)
no_way_out = worst >= MCE_PANIC_SEVERITY;
+
+ if (no_way_out && cfg->tolerant < 3)
+ mce_panic("Fatal machine check on current CPU", &m, msg);
}
} else {
/*
@@ -1403,19 +1406,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
}
}
- /*
- * If tolerant is at an insane level we drop requests to kill
- * processes and continue even when there is no way out.
- */
- if (cfg->tolerant == 3)
- kill_it = 0;
- else if (no_way_out)
- mce_panic("Fatal machine check on current CPU", &m, msg);
-
- if (worst > 0)
- irq_work_queue(&mce_irq_work);
-
- if (worst != MCE_AR_SEVERITY && !kill_it)
+ if (worst != MCE_AR_SEVERITY && !kill_current_task)
goto out;
/* Fault was in user mode and we need to take some action */
@@ -1423,7 +1414,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
/* If this triggers there is no way to recover. Die hard. */
BUG_ON(!on_thread_stack() || !user_mode(regs));
- queue_task_work(&m, kill_it);
+ queue_task_work(&m, kill_current_task);
} else {
/*
@@ -1441,7 +1432,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
}
if (m.kflags & MCE_IN_KERNEL_COPYIN)
- queue_task_work(&m, kill_it);
+ queue_task_work(&m, kill_current_task);
}
out:
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
@@ -1583,7 +1574,7 @@ static void __mcheck_cpu_mce_banks_init(void)
* __mcheck_cpu_init_clear_banks() does the final bank setup.
*/
b->ctl = -1ULL;
- b->init = 1;
+ b->init = true;
}
}
@@ -1764,7 +1755,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
*/
if (c->x86 == 6 && c->x86_model < 0x1A && this_cpu_read(mce_num_banks) > 0)
- mce_banks[0].init = 0;
+ mce_banks[0].init = false;
/*
* All newer Intel systems support MCE broadcasting. Enable
@@ -1813,11 +1804,9 @@ static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
case X86_VENDOR_INTEL:
intel_p5_mcheck_init(c);
return 1;
- break;
case X86_VENDOR_CENTAUR:
winchip_mcheck_init(c);
return 1;
- break;
default:
return 0;
}
@@ -1985,7 +1974,7 @@ void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check;
static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
{
- bool irq_state;
+ irqentry_state_t irq_state;
WARN_ON_ONCE(user_mode(regs));
@@ -1997,7 +1986,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
mce_check_crashing_cpu())
return;
- irq_state = idtentry_enter_nmi(regs);
+ irq_state = irqentry_nmi_enter(regs);
/*
* The call targets are marked noinstr, but objtool can't figure
* that out because it's an indirect call. Annotate it.
@@ -2008,7 +1997,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
if (regs->flags & X86_EFLAGS_IF)
trace_hardirqs_on_prepare();
instrumentation_end();
- idtentry_exit_nmi(regs, irq_state);
+ irqentry_nmi_exit(regs, irq_state);
}
static __always_inline void exc_machine_check_user(struct pt_regs *regs)
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 3a44346f2276..7b360731fc2d 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -522,8 +522,8 @@ static void do_inject(void)
if (boot_cpu_has(X86_FEATURE_AMD_DCM) &&
b == 4 &&
boot_cpu_data.x86 < 0x17) {
- toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu));
- cpu = get_nbc_for_node(amd_get_nb_id(cpu));
+ toggle_nb_mca_mst_cpu(topology_die_id(cpu));
+ cpu = get_nbc_for_node(topology_die_id(cpu));
}
get_online_cpus();
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index abe9fe0fb851..c2476fe0682e 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -509,12 +509,33 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
}
}
+/*
+ * Enable additional error logs from the integrated
+ * memory controller on processors that support this.
+ */
+static void intel_imc_init(struct cpuinfo_x86 *c)
+{
+ u64 error_control;
+
+ switch (c->x86_model) {
+ case INTEL_FAM6_SANDYBRIDGE_X:
+ case INTEL_FAM6_IVYBRIDGE_X:
+ case INTEL_FAM6_HASWELL_X:
+ if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control))
+ return;
+ error_control |= 2;
+ wrmsrl_safe(MSR_ERROR_CONTROL, error_control);
+ break;
+ }
+}
+
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
intel_init_cmci();
intel_init_lmce();
intel_ppin_init(c);
+ intel_imc_init(c);
}
void mce_intel_feature_clear(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 3f6b137ef4e6..3d4a48336084 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -215,7 +215,6 @@ static unsigned int __verify_patch_size(u8 family, u32 sh_psize, size_t buf_size
default:
WARN(1, "%s: WTF family: 0x%x\n", __func__, family);
return 0;
- break;
}
if (sh_psize > min_t(u32, buf_size, max_size))
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 05ef1f4550cb..f628e3dc150f 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -366,9 +366,38 @@ static void __init ms_hyperv_init_platform(void)
#endif
}
+static bool __init ms_hyperv_x2apic_available(void)
+{
+ return x2apic_supported();
+}
+
+/*
+ * If ms_hyperv_msi_ext_dest_id() returns true, hyperv_prepare_irq_remapping()
+ * returns -ENODEV and the Hyper-V IOMMU driver is not used; instead, the
+ * generic support of the 15-bit APIC ID is used: see __irq_msi_compose_msg().
+ *
+ * Note: for a VM on Hyper-V, the I/O-APIC is the only device which
+ * (logically) generates MSIs directly to the system APIC irq domain.
+ * There is no HPET, and PCI MSI/MSI-X interrupts are remapped by the
+ * pci-hyperv host bridge.
+ */
+static bool __init ms_hyperv_msi_ext_dest_id(void)
+{
+ u32 eax;
+
+ eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_INTERFACE);
+ if (eax != HYPERV_VS_INTERFACE_EAX_SIGNATURE)
+ return false;
+
+ eax = cpuid_eax(HYPERV_CPUID_VIRT_STACK_PROPERTIES);
+ return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE;
+}
+
const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
.name = "Microsoft Hyper-V",
.detect = ms_hyperv_platform,
.type = X86_HYPER_MS_HYPERV,
+ .init.x2apic_available = ms_hyperv_x2apic_available,
+ .init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id,
.init.init_platform = ms_hyperv_init_platform,
};
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 6a80f36b5d59..61eb26edc6d2 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -794,8 +794,6 @@ void mtrr_ap_init(void)
if (!use_intel() || mtrr_aps_delayed_init)
return;
- rcu_cpu_starting(smp_processor_id());
-
/*
* Ideally we should hold mtrr_mutex here to avoid mtrr entries
* changed, but this routine will be called in cpu boot time,
@@ -813,7 +811,8 @@ void mtrr_ap_init(void)
}
/**
- * Save current fixed-range MTRR state of the first cpu in cpu_online_mask.
+ * mtrr_save_state - Save current fixed-range MTRR state of the first
+ * cpu in cpu_online_mask.
*/
void mtrr_save_state(void)
{
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index e8b5f1cf1ae8..698bb26aeb6e 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -895,6 +895,10 @@ static __init void __check_quirks_intel(void)
set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat");
else
set_rdt_options("!l3cat");
+ fallthrough;
+ case INTEL_FAM6_BROADWELL_X:
+ intel_rdt_mbm_apply_quirk();
+ break;
}
}
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index f65d3c0dbc41..ee71c47844cb 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -264,7 +264,7 @@ void __exit rdtgroup_exit(void);
struct rftype {
char *name;
umode_t mode;
- struct kernfs_ops *kf_ops;
+ const struct kernfs_ops *kf_ops;
unsigned long flags;
unsigned long fflags;
@@ -619,6 +619,7 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
void mbm_setup_overflow_handler(struct rdt_domain *dom,
unsigned long delay_ms);
void mbm_handle_overflow(struct work_struct *work);
+void __init intel_rdt_mbm_apply_quirk(void);
bool is_mba_sc(struct rdt_resource *r);
void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm);
u32 delay_bw_map(unsigned long bw, struct rdt_resource *r);
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index a98519a3a2e6..7ac31210e452 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -64,6 +64,69 @@ unsigned int rdt_mon_features;
*/
unsigned int resctrl_cqm_threshold;
+#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))
+
+/*
+ * The correction factor table is documented in Documentation/x86/resctrl.rst.
+ * If rmid > rmid threshold, MBM total and local values should be multiplied
+ * by the correction factor.
+ *
+ * The original table is modified for better code:
+ *
+ * 1. The threshold 0 is changed to rmid count - 1 so don't do correction
+ * for the case.
+ * 2. MBM total and local correction table indexed by core counter which is
+ * equal to (x86_cache_max_rmid + 1) / 8 - 1 and is from 0 up to 27.
+ * 3. The correction factor is normalized to 2^20 (1048576) so it's faster
+ * to calculate corrected value by shifting:
+ * corrected_value = (original_value * correction_factor) >> 20
+ */
+static const struct mbm_correction_factor_table {
+ u32 rmidthreshold;
+ u64 cf;
+} mbm_cf_table[] __initdata = {
+ {7, CF(1.000000)},
+ {15, CF(1.000000)},
+ {15, CF(0.969650)},
+ {31, CF(1.000000)},
+ {31, CF(1.066667)},
+ {31, CF(0.969650)},
+ {47, CF(1.142857)},
+ {63, CF(1.000000)},
+ {63, CF(1.185115)},
+ {63, CF(1.066553)},
+ {79, CF(1.454545)},
+ {95, CF(1.000000)},
+ {95, CF(1.230769)},
+ {95, CF(1.142857)},
+ {95, CF(1.066667)},
+ {127, CF(1.000000)},
+ {127, CF(1.254863)},
+ {127, CF(1.185255)},
+ {151, CF(1.000000)},
+ {127, CF(1.066667)},
+ {167, CF(1.000000)},
+ {159, CF(1.454334)},
+ {183, CF(1.000000)},
+ {127, CF(0.969744)},
+ {191, CF(1.280246)},
+ {191, CF(1.230921)},
+ {215, CF(1.000000)},
+ {191, CF(1.143118)},
+};
+
+static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX;
+static u64 mbm_cf __read_mostly;
+
+static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val)
+{
+ /* Correct MBM value. */
+ if (rmid > mbm_cf_rmidthreshold)
+ val = (val * mbm_cf) >> 20;
+
+ return val;
+}
+
static inline struct rmid_entry *__rmid_entry(u32 rmid)
{
struct rmid_entry *entry;
@@ -260,7 +323,8 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
m->chunks += chunks;
m->prev_msr = tval;
- rr->val += m->chunks;
+ rr->val += get_corrected_mbm_count(rmid, m->chunks);
+
return 0;
}
@@ -279,7 +343,7 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
return;
chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width);
- cur_bw = (chunks * r->mon_scale) >> 20;
+ cur_bw = (get_corrected_mbm_count(rmid, chunks) * r->mon_scale) >> 20;
if (m->delta_comp)
m->delta_bw = abs(cur_bw - m->prev_bw);
@@ -642,3 +706,17 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
return 0;
}
+
+void __init intel_rdt_mbm_apply_quirk(void)
+{
+ int cf_index;
+
+ cf_index = (boot_cpu_data.x86_cache_max_rmid + 1) / 8 - 1;
+ if (cf_index >= ARRAY_SIZE(mbm_cf_table)) {
+ pr_info("No MBM correction factor available\n");
+ return;
+ }
+
+ mbm_cf_rmidthreshold = mbm_cf_table[cf_index].rmidthreshold;
+ mbm_cf = mbm_cf_table[cf_index].cf;
+}
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 0daf2f1cf7a8..e916646adc69 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -1458,7 +1458,7 @@ static int pseudo_lock_dev_release(struct inode *inode, struct file *filp)
return 0;
}
-static int pseudo_lock_dev_mremap(struct vm_area_struct *area)
+static int pseudo_lock_dev_mremap(struct vm_area_struct *area, unsigned long flags)
{
/* Not supported */
return -EINVAL;
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index f3418428682b..29ffb95b25ff 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -240,13 +240,13 @@ static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
return -EINVAL;
}
-static struct kernfs_ops rdtgroup_kf_single_ops = {
+static const struct kernfs_ops rdtgroup_kf_single_ops = {
.atomic_write_len = PAGE_SIZE,
.write = rdtgroup_file_write,
.seq_show = rdtgroup_seqfile_show,
};
-static struct kernfs_ops kf_mondata_ops = {
+static const struct kernfs_ops kf_mondata_ops = {
.atomic_write_len = PAGE_SIZE,
.seq_show = rdtgroup_mondata_show,
};
@@ -3023,8 +3023,7 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
return -EPERM;
}
-static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
- cpumask_var_t tmpmask)
+static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
{
struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
int cpu;
@@ -3056,8 +3055,7 @@ static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
return 0;
}
-static int rdtgroup_ctrl_remove(struct kernfs_node *kn,
- struct rdtgroup *rdtgrp)
+static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp)
{
rdtgrp->flags = RDT_DELETED;
list_del(&rdtgrp->rdtgroup_list);
@@ -3066,8 +3064,7 @@ static int rdtgroup_ctrl_remove(struct kernfs_node *kn,
return 0;
}
-static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
- cpumask_var_t tmpmask)
+static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
{
int cpu;
@@ -3094,7 +3091,7 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
closid_free(rdtgrp->closid);
free_rmid(rdtgrp->mon.rmid);
- rdtgroup_ctrl_remove(kn, rdtgrp);
+ rdtgroup_ctrl_remove(rdtgrp);
/*
* Free all the child monitor group rmids.
@@ -3131,13 +3128,13 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
rdtgrp != &rdtgroup_default) {
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
- ret = rdtgroup_ctrl_remove(kn, rdtgrp);
+ ret = rdtgroup_ctrl_remove(rdtgrp);
} else {
- ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask);
+ ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask);
}
} else if (rdtgrp->type == RDTMON_GROUP &&
is_mon_groups(parent_kn, kn->name)) {
- ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask);
+ ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask);
} else {
ret = -EPERM;
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 866c9a9bcdee..236924930bf0 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -44,6 +44,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 },
{ X86_FEATURE_SEV_ES, CPUID_EAX, 3, 0x8000001f, 0 },
{ X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 },
+ { X86_FEATURE_VM_PAGE_FLUSH, CPUID_EAX, 2, 0x8000001f, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
new file mode 100644
index 000000000000..91d3dc784a29
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/Makefile
@@ -0,0 +1,5 @@
+obj-y += \
+ driver.o \
+ encl.o \
+ ioctl.o \
+ main.o
diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/kernel/cpu/sgx/arch.h
new file mode 100644
index 000000000000..dd7602c44c72
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/arch.h
@@ -0,0 +1,338 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/**
+ * Copyright(c) 2016-20 Intel Corporation.
+ *
+ * Contains data structures defined by the SGX architecture. Data structures
+ * defined by the Linux software stack should not be placed here.
+ */
+#ifndef _ASM_X86_SGX_ARCH_H
+#define _ASM_X86_SGX_ARCH_H
+
+#include <linux/bits.h>
+#include <linux/types.h>
+
+/* The SGX specific CPUID function. */
+#define SGX_CPUID 0x12
+/* EPC enumeration. */
+#define SGX_CPUID_EPC 2
+/* An invalid EPC section, i.e. the end marker. */
+#define SGX_CPUID_EPC_INVALID 0x0
+/* A valid EPC section. */
+#define SGX_CPUID_EPC_SECTION 0x1
+/* The bitmask for the EPC section type. */
+#define SGX_CPUID_EPC_MASK GENMASK(3, 0)
+
+/**
+ * enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
+ * %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not
+ * been completed yet.
+ * %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's
+ * public key does not match IA32_SGXLEPUBKEYHASH.
+ * %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received
+ */
+enum sgx_return_code {
+ SGX_NOT_TRACKED = 11,
+ SGX_INVALID_EINITTOKEN = 16,
+ SGX_UNMASKED_EVENT = 128,
+};
+
+/* The modulus size for 3072-bit RSA keys. */
+#define SGX_MODULUS_SIZE 384
+
+/**
+ * enum sgx_miscselect - additional information to an SSA frame
+ * %SGX_MISC_EXINFO: Report #PF or #GP to the SSA frame.
+ *
+ * Save State Area (SSA) is a stack inside the enclave used to store processor
+ * state when an exception or interrupt occurs. This enum defines additional
+ * information stored to an SSA frame.
+ */
+enum sgx_miscselect {
+ SGX_MISC_EXINFO = BIT(0),
+};
+
+#define SGX_MISC_RESERVED_MASK GENMASK_ULL(63, 1)
+
+#define SGX_SSA_GPRS_SIZE 184
+#define SGX_SSA_MISC_EXINFO_SIZE 16
+
+/**
+ * enum sgx_attributes - the attributes field in &struct sgx_secs
+ * %SGX_ATTR_INIT: Enclave can be entered (is initialized).
+ * %SGX_ATTR_DEBUG: Allow ENCLS(EDBGRD) and ENCLS(EDBGWR).
+ * %SGX_ATTR_MODE64BIT: Tell that this a 64-bit enclave.
+ * %SGX_ATTR_PROVISIONKEY: Allow to use provisioning keys for remote
+ * attestation.
+ * %SGX_ATTR_KSS: Allow to use key separation and sharing (KSS).
+ * %SGX_ATTR_EINITTOKENKEY: Allow to use token signing key that is used to
+ * sign cryptographic tokens that can be passed to
+ * EINIT as an authorization to run an enclave.
+ */
+enum sgx_attribute {
+ SGX_ATTR_INIT = BIT(0),
+ SGX_ATTR_DEBUG = BIT(1),
+ SGX_ATTR_MODE64BIT = BIT(2),
+ SGX_ATTR_PROVISIONKEY = BIT(4),
+ SGX_ATTR_EINITTOKENKEY = BIT(5),
+ SGX_ATTR_KSS = BIT(7),
+};
+
+#define SGX_ATTR_RESERVED_MASK (BIT_ULL(3) | BIT_ULL(6) | GENMASK_ULL(63, 8))
+
+/**
+ * struct sgx_secs - SGX Enclave Control Structure (SECS)
+ * @size: size of the address space
+ * @base: base address of the address space
+ * @ssa_frame_size: size of an SSA frame
+ * @miscselect: additional information stored to an SSA frame
+ * @attributes: attributes for enclave
+ * @xfrm: XSave-Feature Request Mask (subset of XCR0)
+ * @mrenclave: SHA256-hash of the enclave contents
+ * @mrsigner: SHA256-hash of the public key used to sign the SIGSTRUCT
+ * @config_id: a user-defined value that is used in key derivation
+ * @isv_prod_id: a user-defined value that is used in key derivation
+ * @isv_svn: a user-defined value that is used in key derivation
+ * @config_svn: a user-defined value that is used in key derivation
+ *
+ * SGX Enclave Control Structure (SECS) is a special enclave page that is not
+ * visible in the address space. In fact, this structure defines the address
+ * range and other global attributes for the enclave and it is the first EPC
+ * page created for any enclave. It is moved from a temporary buffer to an EPC
+ * by the means of ENCLS[ECREATE] function.
+ */
+struct sgx_secs {
+ u64 size;
+ u64 base;
+ u32 ssa_frame_size;
+ u32 miscselect;
+ u8 reserved1[24];
+ u64 attributes;
+ u64 xfrm;
+ u32 mrenclave[8];
+ u8 reserved2[32];
+ u32 mrsigner[8];
+ u8 reserved3[32];
+ u32 config_id[16];
+ u16 isv_prod_id;
+ u16 isv_svn;
+ u16 config_svn;
+ u8 reserved4[3834];
+} __packed;
+
+/**
+ * enum sgx_tcs_flags - execution flags for TCS
+ * %SGX_TCS_DBGOPTIN: If enabled allows single-stepping and breakpoints
+ * inside an enclave. It is cleared by EADD but can
+ * be set later with EDBGWR.
+ */
+enum sgx_tcs_flags {
+ SGX_TCS_DBGOPTIN = 0x01,
+};
+
+#define SGX_TCS_RESERVED_MASK GENMASK_ULL(63, 1)
+#define SGX_TCS_RESERVED_SIZE 4024
+
+/**
+ * struct sgx_tcs - Thread Control Structure (TCS)
+ * @state: used to mark an entered TCS
+ * @flags: execution flags (cleared by EADD)
+ * @ssa_offset: SSA stack offset relative to the enclave base
+ * @ssa_index: the current SSA frame index (cleard by EADD)
+ * @nr_ssa_frames: the number of frame in the SSA stack
+ * @entry_offset: entry point offset relative to the enclave base
+ * @exit_addr: address outside the enclave to exit on an exception or
+ * interrupt
+ * @fs_offset: offset relative to the enclave base to become FS
+ * segment inside the enclave
+ * @gs_offset: offset relative to the enclave base to become GS
+ * segment inside the enclave
+ * @fs_limit: size to become a new FS-limit (only 32-bit enclaves)
+ * @gs_limit: size to become a new GS-limit (only 32-bit enclaves)
+ *
+ * Thread Control Structure (TCS) is an enclave page visible in its address
+ * space that defines an entry point inside the enclave. A thread enters inside
+ * an enclave by supplying address of TCS to ENCLU(EENTER). A TCS can be entered
+ * by only one thread at a time.
+ */
+struct sgx_tcs {
+ u64 state;
+ u64 flags;
+ u64 ssa_offset;
+ u32 ssa_index;
+ u32 nr_ssa_frames;
+ u64 entry_offset;
+ u64 exit_addr;
+ u64 fs_offset;
+ u64 gs_offset;
+ u32 fs_limit;
+ u32 gs_limit;
+ u8 reserved[SGX_TCS_RESERVED_SIZE];
+} __packed;
+
+/**
+ * struct sgx_pageinfo - an enclave page descriptor
+ * @addr: address of the enclave page
+ * @contents: pointer to the page contents
+ * @metadata: pointer either to a SECINFO or PCMD instance
+ * @secs: address of the SECS page
+ */
+struct sgx_pageinfo {
+ u64 addr;
+ u64 contents;
+ u64 metadata;
+ u64 secs;
+} __packed __aligned(32);
+
+
+/**
+ * enum sgx_page_type - bits in the SECINFO flags defining the page type
+ * %SGX_PAGE_TYPE_SECS: a SECS page
+ * %SGX_PAGE_TYPE_TCS: a TCS page
+ * %SGX_PAGE_TYPE_REG: a regular page
+ * %SGX_PAGE_TYPE_VA: a VA page
+ * %SGX_PAGE_TYPE_TRIM: a page in trimmed state
+ */
+enum sgx_page_type {
+ SGX_PAGE_TYPE_SECS,
+ SGX_PAGE_TYPE_TCS,
+ SGX_PAGE_TYPE_REG,
+ SGX_PAGE_TYPE_VA,
+ SGX_PAGE_TYPE_TRIM,
+};
+
+#define SGX_NR_PAGE_TYPES 5
+#define SGX_PAGE_TYPE_MASK GENMASK(7, 0)
+
+/**
+ * enum sgx_secinfo_flags - the flags field in &struct sgx_secinfo
+ * %SGX_SECINFO_R: allow read
+ * %SGX_SECINFO_W: allow write
+ * %SGX_SECINFO_X: allow execution
+ * %SGX_SECINFO_SECS: a SECS page
+ * %SGX_SECINFO_TCS: a TCS page
+ * %SGX_SECINFO_REG: a regular page
+ * %SGX_SECINFO_VA: a VA page
+ * %SGX_SECINFO_TRIM: a page in trimmed state
+ */
+enum sgx_secinfo_flags {
+ SGX_SECINFO_R = BIT(0),
+ SGX_SECINFO_W = BIT(1),
+ SGX_SECINFO_X = BIT(2),
+ SGX_SECINFO_SECS = (SGX_PAGE_TYPE_SECS << 8),
+ SGX_SECINFO_TCS = (SGX_PAGE_TYPE_TCS << 8),
+ SGX_SECINFO_REG = (SGX_PAGE_TYPE_REG << 8),
+ SGX_SECINFO_VA = (SGX_PAGE_TYPE_VA << 8),
+ SGX_SECINFO_TRIM = (SGX_PAGE_TYPE_TRIM << 8),
+};
+
+#define SGX_SECINFO_PERMISSION_MASK GENMASK_ULL(2, 0)
+#define SGX_SECINFO_PAGE_TYPE_MASK (SGX_PAGE_TYPE_MASK << 8)
+#define SGX_SECINFO_RESERVED_MASK ~(SGX_SECINFO_PERMISSION_MASK | \
+ SGX_SECINFO_PAGE_TYPE_MASK)
+
+/**
+ * struct sgx_secinfo - describes attributes of an EPC page
+ * @flags: permissions and type
+ *
+ * Used together with ENCLS leaves that add or modify an EPC page to an
+ * enclave to define page permissions and type.
+ */
+struct sgx_secinfo {
+ u64 flags;
+ u8 reserved[56];
+} __packed __aligned(64);
+
+#define SGX_PCMD_RESERVED_SIZE 40
+
+/**
+ * struct sgx_pcmd - Paging Crypto Metadata (PCMD)
+ * @enclave_id: enclave identifier
+ * @mac: MAC over PCMD, page contents and isvsvn
+ *
+ * PCMD is stored for every swapped page to the regular memory. When ELDU loads
+ * the page back it recalculates the MAC by using a isvsvn number stored in a
+ * VA page. Together these two structures bring integrity and rollback
+ * protection.
+ */
+struct sgx_pcmd {
+ struct sgx_secinfo secinfo;
+ u64 enclave_id;
+ u8 reserved[SGX_PCMD_RESERVED_SIZE];
+ u8 mac[16];
+} __packed __aligned(128);
+
+#define SGX_SIGSTRUCT_RESERVED1_SIZE 84
+#define SGX_SIGSTRUCT_RESERVED2_SIZE 20
+#define SGX_SIGSTRUCT_RESERVED3_SIZE 32
+#define SGX_SIGSTRUCT_RESERVED4_SIZE 12
+
+/**
+ * struct sgx_sigstruct_header - defines author of the enclave
+ * @header1: constant byte string
+ * @vendor: must be either 0x0000 or 0x8086
+ * @date: YYYYMMDD in BCD
+ * @header2: costant byte string
+ * @swdefined: software defined value
+ */
+struct sgx_sigstruct_header {
+ u64 header1[2];
+ u32 vendor;
+ u32 date;
+ u64 header2[2];
+ u32 swdefined;
+ u8 reserved1[84];
+} __packed;
+
+/**
+ * struct sgx_sigstruct_body - defines contents of the enclave
+ * @miscselect: additional information stored to an SSA frame
+ * @misc_mask: required miscselect in SECS
+ * @attributes: attributes for enclave
+ * @xfrm: XSave-Feature Request Mask (subset of XCR0)
+ * @attributes_mask: required attributes in SECS
+ * @xfrm_mask: required XFRM in SECS
+ * @mrenclave: SHA256-hash of the enclave contents
+ * @isvprodid: a user-defined value that is used in key derivation
+ * @isvsvn: a user-defined value that is used in key derivation
+ */
+struct sgx_sigstruct_body {
+ u32 miscselect;
+ u32 misc_mask;
+ u8 reserved2[20];
+ u64 attributes;
+ u64 xfrm;
+ u64 attributes_mask;
+ u64 xfrm_mask;
+ u8 mrenclave[32];
+ u8 reserved3[32];
+ u16 isvprodid;
+ u16 isvsvn;
+} __packed;
+
+/**
+ * struct sgx_sigstruct - an enclave signature
+ * @header: defines author of the enclave
+ * @modulus: the modulus of the public key
+ * @exponent: the exponent of the public key
+ * @signature: the signature calculated over the fields except modulus,
+ * @body: defines contents of the enclave
+ * @q1: a value used in RSA signature verification
+ * @q2: a value used in RSA signature verification
+ *
+ * Header and body are the parts that are actual signed. The remaining fields
+ * define the signature of the enclave.
+ */
+struct sgx_sigstruct {
+ struct sgx_sigstruct_header header;
+ u8 modulus[SGX_MODULUS_SIZE];
+ u32 exponent;
+ u8 signature[SGX_MODULUS_SIZE];
+ struct sgx_sigstruct_body body;
+ u8 reserved4[12];
+ u8 q1[SGX_MODULUS_SIZE];
+ u8 q2[SGX_MODULUS_SIZE];
+} __packed;
+
+#define SGX_LAUNCH_TOKEN_SIZE 304
+
+#endif /* _ASM_X86_SGX_ARCH_H */
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
new file mode 100644
index 000000000000..f2eac41bb4ff
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <linux/acpi.h>
+#include <linux/miscdevice.h>
+#include <linux/mman.h>
+#include <linux/security.h>
+#include <linux/suspend.h>
+#include <asm/traps.h>
+#include "driver.h"
+#include "encl.h"
+
+u64 sgx_attributes_reserved_mask;
+u64 sgx_xfrm_reserved_mask = ~0x3;
+u32 sgx_misc_reserved_mask;
+
+static int sgx_open(struct inode *inode, struct file *file)
+{
+ struct sgx_encl *encl;
+ int ret;
+
+ encl = kzalloc(sizeof(*encl), GFP_KERNEL);
+ if (!encl)
+ return -ENOMEM;
+
+ kref_init(&encl->refcount);
+ xa_init(&encl->page_array);
+ mutex_init(&encl->lock);
+ INIT_LIST_HEAD(&encl->va_pages);
+ INIT_LIST_HEAD(&encl->mm_list);
+ spin_lock_init(&encl->mm_lock);
+
+ ret = init_srcu_struct(&encl->srcu);
+ if (ret) {
+ kfree(encl);
+ return ret;
+ }
+
+ file->private_data = encl;
+
+ return 0;
+}
+
+static int sgx_release(struct inode *inode, struct file *file)
+{
+ struct sgx_encl *encl = file->private_data;
+ struct sgx_encl_mm *encl_mm;
+
+ /*
+ * Drain the remaining mm_list entries. At this point the list contains
+ * entries for processes, which have closed the enclave file but have
+ * not exited yet. The processes, which have exited, are gone from the
+ * list by sgx_mmu_notifier_release().
+ */
+ for ( ; ; ) {
+ spin_lock(&encl->mm_lock);
+
+ if (list_empty(&encl->mm_list)) {
+ encl_mm = NULL;
+ } else {
+ encl_mm = list_first_entry(&encl->mm_list,
+ struct sgx_encl_mm, list);
+ list_del_rcu(&encl_mm->list);
+ }
+
+ spin_unlock(&encl->mm_lock);
+
+ /* The enclave is no longer mapped by any mm. */
+ if (!encl_mm)
+ break;
+
+ synchronize_srcu(&encl->srcu);
+ mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm);
+ kfree(encl_mm);
+ }
+
+ kref_put(&encl->refcount, sgx_encl_release);
+ return 0;
+}
+
+static int sgx_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct sgx_encl *encl = file->private_data;
+ int ret;
+
+ ret = sgx_encl_may_map(encl, vma->vm_start, vma->vm_end, vma->vm_flags);
+ if (ret)
+ return ret;
+
+ ret = sgx_encl_mm_add(encl, vma->vm_mm);
+ if (ret)
+ return ret;
+
+ vma->vm_ops = &sgx_vm_ops;
+ vma->vm_flags |= VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP | VM_IO;
+ vma->vm_private_data = encl;
+
+ return 0;
+}
+
+static unsigned long sgx_get_unmapped_area(struct file *file,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long pgoff,
+ unsigned long flags)
+{
+ if ((flags & MAP_TYPE) == MAP_PRIVATE)
+ return -EINVAL;
+
+ if (flags & MAP_FIXED)
+ return addr;
+
+ return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+}
+
+#ifdef CONFIG_COMPAT
+static long sgx_compat_ioctl(struct file *filep, unsigned int cmd,
+ unsigned long arg)
+{
+ return sgx_ioctl(filep, cmd, arg);
+}
+#endif
+
+static const struct file_operations sgx_encl_fops = {
+ .owner = THIS_MODULE,
+ .open = sgx_open,
+ .release = sgx_release,
+ .unlocked_ioctl = sgx_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = sgx_compat_ioctl,
+#endif
+ .mmap = sgx_mmap,
+ .get_unmapped_area = sgx_get_unmapped_area,
+};
+
+const struct file_operations sgx_provision_fops = {
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice sgx_dev_enclave = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_enclave",
+ .nodename = "sgx_enclave",
+ .fops = &sgx_encl_fops,
+};
+
+static struct miscdevice sgx_dev_provision = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_provision",
+ .nodename = "sgx_provision",
+ .fops = &sgx_provision_fops,
+};
+
+int __init sgx_drv_init(void)
+{
+ unsigned int eax, ebx, ecx, edx;
+ u64 attr_mask;
+ u64 xfrm_mask;
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_LC))
+ return -ENODEV;
+
+ cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx);
+
+ if (!(eax & 1)) {
+ pr_err("SGX disabled: SGX1 instruction support not available.\n");
+ return -ENODEV;
+ }
+
+ sgx_misc_reserved_mask = ~ebx | SGX_MISC_RESERVED_MASK;
+
+ cpuid_count(SGX_CPUID, 1, &eax, &ebx, &ecx, &edx);
+
+ attr_mask = (((u64)ebx) << 32) + (u64)eax;
+ sgx_attributes_reserved_mask = ~attr_mask | SGX_ATTR_RESERVED_MASK;
+
+ if (cpu_feature_enabled(X86_FEATURE_OSXSAVE)) {
+ xfrm_mask = (((u64)edx) << 32) + (u64)ecx;
+ sgx_xfrm_reserved_mask = ~xfrm_mask;
+ }
+
+ ret = misc_register(&sgx_dev_enclave);
+ if (ret)
+ return ret;
+
+ ret = misc_register(&sgx_dev_provision);
+ if (ret) {
+ misc_deregister(&sgx_dev_enclave);
+ return ret;
+ }
+
+ return 0;
+}
diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h
new file mode 100644
index 000000000000..4eddb4d571ef
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/driver.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ARCH_SGX_DRIVER_H__
+#define __ARCH_SGX_DRIVER_H__
+
+#include <crypto/hash.h>
+#include <linux/kref.h>
+#include <linux/mmu_notifier.h>
+#include <linux/radix-tree.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <uapi/asm/sgx.h>
+#include "sgx.h"
+
+#define SGX_EINIT_SPIN_COUNT 20
+#define SGX_EINIT_SLEEP_COUNT 50
+#define SGX_EINIT_SLEEP_TIME 20
+
+extern u64 sgx_attributes_reserved_mask;
+extern u64 sgx_xfrm_reserved_mask;
+extern u32 sgx_misc_reserved_mask;
+
+extern const struct file_operations sgx_provision_fops;
+
+long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
+
+int sgx_drv_init(void);
+
+#endif /* __ARCH_X86_SGX_DRIVER_H__ */
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
new file mode 100644
index 000000000000..ee50a5010277
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -0,0 +1,740 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <linux/lockdep.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/shmem_fs.h>
+#include <linux/suspend.h>
+#include <linux/sched/mm.h>
+#include "arch.h"
+#include "encl.h"
+#include "encls.h"
+#include "sgx.h"
+
+/*
+ * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
+ * Pages" in the SDM.
+ */
+static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
+ struct sgx_epc_page *epc_page,
+ struct sgx_epc_page *secs_page)
+{
+ unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
+ struct sgx_encl *encl = encl_page->encl;
+ struct sgx_pageinfo pginfo;
+ struct sgx_backing b;
+ pgoff_t page_index;
+ int ret;
+
+ if (secs_page)
+ page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
+ else
+ page_index = PFN_DOWN(encl->size);
+
+ ret = sgx_encl_get_backing(encl, page_index, &b);
+ if (ret)
+ return ret;
+
+ pginfo.addr = encl_page->desc & PAGE_MASK;
+ pginfo.contents = (unsigned long)kmap_atomic(b.contents);
+ pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) +
+ b.pcmd_offset;
+
+ if (secs_page)
+ pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
+ else
+ pginfo.secs = 0;
+
+ ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page),
+ sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset);
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "ELDU");
+
+ ret = -EFAULT;
+ }
+
+ kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset));
+ kunmap_atomic((void *)(unsigned long)pginfo.contents);
+
+ sgx_encl_put_backing(&b, false);
+
+ return ret;
+}
+
+static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
+ struct sgx_epc_page *secs_page)
+{
+
+ unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
+ struct sgx_encl *encl = encl_page->encl;
+ struct sgx_epc_page *epc_page;
+ int ret;
+
+ epc_page = sgx_alloc_epc_page(encl_page, false);
+ if (IS_ERR(epc_page))
+ return epc_page;
+
+ ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
+ if (ret) {
+ sgx_free_epc_page(epc_page);
+ return ERR_PTR(ret);
+ }
+
+ sgx_free_va_slot(encl_page->va_page, va_offset);
+ list_move(&encl_page->va_page->list, &encl->va_pages);
+ encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK;
+ encl_page->epc_page = epc_page;
+
+ return epc_page;
+}
+
+static struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
+ unsigned long addr,
+ unsigned long vm_flags)
+{
+ unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
+ struct sgx_epc_page *epc_page;
+ struct sgx_encl_page *entry;
+
+ entry = xa_load(&encl->page_array, PFN_DOWN(addr));
+ if (!entry)
+ return ERR_PTR(-EFAULT);
+
+ /*
+ * Verify that the faulted page has equal or higher build time
+ * permissions than the VMA permissions (i.e. the subset of {VM_READ,
+ * VM_WRITE, VM_EXECUTE} in vma->vm_flags).
+ */
+ if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits)
+ return ERR_PTR(-EFAULT);
+
+ /* Entry successfully located. */
+ if (entry->epc_page) {
+ if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)
+ return ERR_PTR(-EBUSY);
+
+ return entry;
+ }
+
+ if (!(encl->secs.epc_page)) {
+ epc_page = sgx_encl_eldu(&encl->secs, NULL);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
+ }
+
+ epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
+
+ encl->secs_child_cnt++;
+ sgx_mark_page_reclaimable(entry->epc_page);
+
+ return entry;
+}
+
+static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
+{
+ unsigned long addr = (unsigned long)vmf->address;
+ struct vm_area_struct *vma = vmf->vma;
+ struct sgx_encl_page *entry;
+ unsigned long phys_addr;
+ struct sgx_encl *encl;
+ unsigned long pfn;
+ vm_fault_t ret;
+
+ encl = vma->vm_private_data;
+
+ /*
+ * It's very unlikely but possible that allocating memory for the
+ * mm_list entry of a forked process failed in sgx_vma_open(). When
+ * this happens, vm_private_data is set to NULL.
+ */
+ if (unlikely(!encl))
+ return VM_FAULT_SIGBUS;
+
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page(encl, addr, vma->vm_flags);
+ if (IS_ERR(entry)) {
+ mutex_unlock(&encl->lock);
+
+ if (PTR_ERR(entry) == -EBUSY)
+ return VM_FAULT_NOPAGE;
+
+ return VM_FAULT_SIGBUS;
+ }
+
+ phys_addr = sgx_get_epc_phys_addr(entry->epc_page);
+
+ /* Check if another thread got here first to insert the PTE. */
+ if (!follow_pfn(vma, addr, &pfn)) {
+ mutex_unlock(&encl->lock);
+
+ return VM_FAULT_NOPAGE;
+ }
+
+ ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
+ if (ret != VM_FAULT_NOPAGE) {
+ mutex_unlock(&encl->lock);
+
+ return VM_FAULT_SIGBUS;
+ }
+
+ sgx_encl_test_and_clear_young(vma->vm_mm, entry);
+ mutex_unlock(&encl->lock);
+
+ return VM_FAULT_NOPAGE;
+}
+
+static void sgx_vma_open(struct vm_area_struct *vma)
+{
+ struct sgx_encl *encl = vma->vm_private_data;
+
+ /*
+ * It's possible but unlikely that vm_private_data is NULL. This can
+ * happen in a grandchild of a process, when sgx_encl_mm_add() had
+ * failed to allocate memory in this callback.
+ */
+ if (unlikely(!encl))
+ return;
+
+ if (sgx_encl_mm_add(encl, vma->vm_mm))
+ vma->vm_private_data = NULL;
+}
+
+
+/**
+ * sgx_encl_may_map() - Check if a requested VMA mapping is allowed
+ * @encl: an enclave pointer
+ * @start: lower bound of the address range, inclusive
+ * @end: upper bound of the address range, exclusive
+ * @vm_flags: VMA flags
+ *
+ * Iterate through the enclave pages contained within [@start, @end) to verify
+ * that the permissions requested by a subset of {VM_READ, VM_WRITE, VM_EXEC}
+ * do not contain any permissions that are not contained in the build time
+ * permissions of any of the enclave pages within the given address range.
+ *
+ * An enclave creator must declare the strongest permissions that will be
+ * needed for each enclave page. This ensures that mappings have the identical
+ * or weaker permissions than the earlier declared permissions.
+ *
+ * Return: 0 on success, -EACCES otherwise
+ */
+int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
+ unsigned long end, unsigned long vm_flags)
+{
+ unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
+ struct sgx_encl_page *page;
+ unsigned long count = 0;
+ int ret = 0;
+
+ XA_STATE(xas, &encl->page_array, PFN_DOWN(start));
+
+ /*
+ * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
+ * conflict with the enclave page permissions.
+ */
+ if (current->personality & READ_IMPLIES_EXEC)
+ return -EACCES;
+
+ mutex_lock(&encl->lock);
+ xas_lock(&xas);
+ xas_for_each(&xas, page, PFN_DOWN(end - 1)) {
+ if (~page->vm_max_prot_bits & vm_prot_bits) {
+ ret = -EACCES;
+ break;
+ }
+
+ /* Reschedule on every XA_CHECK_SCHED iteration. */
+ if (!(++count % XA_CHECK_SCHED)) {
+ xas_pause(&xas);
+ xas_unlock(&xas);
+ mutex_unlock(&encl->lock);
+
+ cond_resched();
+
+ mutex_lock(&encl->lock);
+ xas_lock(&xas);
+ }
+ }
+ xas_unlock(&xas);
+ mutex_unlock(&encl->lock);
+
+ return ret;
+}
+
+static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, unsigned long newflags)
+{
+ return sgx_encl_may_map(vma->vm_private_data, start, end, newflags);
+}
+
+static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page,
+ unsigned long addr, void *data)
+{
+ unsigned long offset = addr & ~PAGE_MASK;
+ int ret;
+
+
+ ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
+ if (ret)
+ return -EIO;
+
+ return 0;
+}
+
+static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page,
+ unsigned long addr, void *data)
+{
+ unsigned long offset = addr & ~PAGE_MASK;
+ int ret;
+
+ ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
+ if (ret)
+ return -EIO;
+
+ return 0;
+}
+
+/*
+ * Load an enclave page to EPC if required, and take encl->lock.
+ */
+static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl,
+ unsigned long addr,
+ unsigned long vm_flags)
+{
+ struct sgx_encl_page *entry;
+
+ for ( ; ; ) {
+ mutex_lock(&encl->lock);
+
+ entry = sgx_encl_load_page(encl, addr, vm_flags);
+ if (PTR_ERR(entry) != -EBUSY)
+ break;
+
+ mutex_unlock(&encl->lock);
+ }
+
+ if (IS_ERR(entry))
+ mutex_unlock(&encl->lock);
+
+ return entry;
+}
+
+static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
+ void *buf, int len, int write)
+{
+ struct sgx_encl *encl = vma->vm_private_data;
+ struct sgx_encl_page *entry = NULL;
+ char data[sizeof(unsigned long)];
+ unsigned long align;
+ int offset;
+ int cnt;
+ int ret = 0;
+ int i;
+
+ /*
+ * If process was forked, VMA is still there but vm_private_data is set
+ * to NULL.
+ */
+ if (!encl)
+ return -EFAULT;
+
+ if (!test_bit(SGX_ENCL_DEBUG, &encl->flags))
+ return -EFAULT;
+
+ for (i = 0; i < len; i += cnt) {
+ entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK,
+ vma->vm_flags);
+ if (IS_ERR(entry)) {
+ ret = PTR_ERR(entry);
+ break;
+ }
+
+ align = ALIGN_DOWN(addr + i, sizeof(unsigned long));
+ offset = (addr + i) & (sizeof(unsigned long) - 1);
+ cnt = sizeof(unsigned long) - offset;
+ cnt = min(cnt, len - i);
+
+ ret = sgx_encl_debug_read(encl, entry, align, data);
+ if (ret)
+ goto out;
+
+ if (write) {
+ memcpy(data + offset, buf + i, cnt);
+ ret = sgx_encl_debug_write(encl, entry, align, data);
+ if (ret)
+ goto out;
+ } else {
+ memcpy(buf + i, data + offset, cnt);
+ }
+
+out:
+ mutex_unlock(&encl->lock);
+
+ if (ret)
+ break;
+ }
+
+ return ret < 0 ? ret : i;
+}
+
+const struct vm_operations_struct sgx_vm_ops = {
+ .fault = sgx_vma_fault,
+ .mprotect = sgx_vma_mprotect,
+ .open = sgx_vma_open,
+ .access = sgx_vma_access,
+};
+
+/**
+ * sgx_encl_release - Destroy an enclave instance
+ * @kref: address of a kref inside &sgx_encl
+ *
+ * Used together with kref_put(). Frees all the resources associated with the
+ * enclave and the instance itself.
+ */
+void sgx_encl_release(struct kref *ref)
+{
+ struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
+ struct sgx_va_page *va_page;
+ struct sgx_encl_page *entry;
+ unsigned long index;
+
+ xa_for_each(&encl->page_array, index, entry) {
+ if (entry->epc_page) {
+ /*
+ * The page and its radix tree entry cannot be freed
+ * if the page is being held by the reclaimer.
+ */
+ if (sgx_unmark_page_reclaimable(entry->epc_page))
+ continue;
+
+ sgx_free_epc_page(entry->epc_page);
+ encl->secs_child_cnt--;
+ entry->epc_page = NULL;
+ }
+
+ kfree(entry);
+ }
+
+ xa_destroy(&encl->page_array);
+
+ if (!encl->secs_child_cnt && encl->secs.epc_page) {
+ sgx_free_epc_page(encl->secs.epc_page);
+ encl->secs.epc_page = NULL;
+ }
+
+ while (!list_empty(&encl->va_pages)) {
+ va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
+ list);
+ list_del(&va_page->list);
+ sgx_free_epc_page(va_page->epc_page);
+ kfree(va_page);
+ }
+
+ if (encl->backing)
+ fput(encl->backing);
+
+ cleanup_srcu_struct(&encl->srcu);
+
+ WARN_ON_ONCE(!list_empty(&encl->mm_list));
+
+ /* Detect EPC page leak's. */
+ WARN_ON_ONCE(encl->secs_child_cnt);
+ WARN_ON_ONCE(encl->secs.epc_page);
+
+ kfree(encl);
+}
+
+/*
+ * 'mm' is exiting and no longer needs mmu notifications.
+ */
+static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
+ struct sgx_encl_mm *tmp = NULL;
+
+ /*
+ * The enclave itself can remove encl_mm. Note, objects can't be moved
+ * off an RCU protected list, but deletion is ok.
+ */
+ spin_lock(&encl_mm->encl->mm_lock);
+ list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
+ if (tmp == encl_mm) {
+ list_del_rcu(&encl_mm->list);
+ break;
+ }
+ }
+ spin_unlock(&encl_mm->encl->mm_lock);
+
+ if (tmp == encl_mm) {
+ synchronize_srcu(&encl_mm->encl->srcu);
+ mmu_notifier_put(mn);
+ }
+}
+
+static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
+{
+ struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
+
+ kfree(encl_mm);
+}
+
+static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
+ .release = sgx_mmu_notifier_release,
+ .free_notifier = sgx_mmu_notifier_free,
+};
+
+static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
+ struct mm_struct *mm)
+{
+ struct sgx_encl_mm *encl_mm = NULL;
+ struct sgx_encl_mm *tmp;
+ int idx;
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(tmp, &encl->mm_list, list) {
+ if (tmp->mm == mm) {
+ encl_mm = tmp;
+ break;
+ }
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ return encl_mm;
+}
+
+int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
+{
+ struct sgx_encl_mm *encl_mm;
+ int ret;
+
+ /*
+ * Even though a single enclave may be mapped into an mm more than once,
+ * each 'mm' only appears once on encl->mm_list. This is guaranteed by
+ * holding the mm's mmap lock for write before an mm can be added or
+ * remove to an encl->mm_list.
+ */
+ mmap_assert_write_locked(mm);
+
+ /*
+ * It's possible that an entry already exists in the mm_list, because it
+ * is removed only on VFS release or process exit.
+ */
+ if (sgx_encl_find_mm(encl, mm))
+ return 0;
+
+ encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL);
+ if (!encl_mm)
+ return -ENOMEM;
+
+ encl_mm->encl = encl;
+ encl_mm->mm = mm;
+ encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;
+
+ ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm);
+ if (ret) {
+ kfree(encl_mm);
+ return ret;
+ }
+
+ spin_lock(&encl->mm_lock);
+ list_add_rcu(&encl_mm->list, &encl->mm_list);
+ /* Pairs with smp_rmb() in sgx_reclaimer_block(). */
+ smp_wmb();
+ encl->mm_list_version++;
+ spin_unlock(&encl->mm_lock);
+
+ return 0;
+}
+
+static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
+ pgoff_t index)
+{
+ struct inode *inode = encl->backing->f_path.dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+ gfp_t gfpmask = mapping_gfp_mask(mapping);
+
+ return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
+}
+
+/**
+ * sgx_encl_get_backing() - Pin the backing storage
+ * @encl: an enclave pointer
+ * @page_index: enclave page index
+ * @backing: data for accessing backing storage for the page
+ *
+ * Pin the backing storage pages for storing the encrypted contents and Paging
+ * Crypto MetaData (PCMD) of an enclave page.
+ *
+ * Return:
+ * 0 on success,
+ * -errno otherwise.
+ */
+int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing)
+{
+ pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5);
+ struct page *contents;
+ struct page *pcmd;
+
+ contents = sgx_encl_get_backing_page(encl, page_index);
+ if (IS_ERR(contents))
+ return PTR_ERR(contents);
+
+ pcmd = sgx_encl_get_backing_page(encl, pcmd_index);
+ if (IS_ERR(pcmd)) {
+ put_page(contents);
+ return PTR_ERR(pcmd);
+ }
+
+ backing->page_index = page_index;
+ backing->contents = contents;
+ backing->pcmd = pcmd;
+ backing->pcmd_offset =
+ (page_index & (PAGE_SIZE / sizeof(struct sgx_pcmd) - 1)) *
+ sizeof(struct sgx_pcmd);
+
+ return 0;
+}
+
+/**
+ * sgx_encl_put_backing() - Unpin the backing storage
+ * @backing: data for accessing backing storage for the page
+ * @do_write: mark pages dirty
+ */
+void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write)
+{
+ if (do_write) {
+ set_page_dirty(backing->pcmd);
+ set_page_dirty(backing->contents);
+ }
+
+ put_page(backing->pcmd);
+ put_page(backing->contents);
+}
+
+static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr,
+ void *data)
+{
+ pte_t pte;
+ int ret;
+
+ ret = pte_young(*ptep);
+ if (ret) {
+ pte = pte_mkold(*ptep);
+ set_pte_at((struct mm_struct *)data, addr, ptep, pte);
+ }
+
+ return ret;
+}
+
+/**
+ * sgx_encl_test_and_clear_young() - Test and reset the accessed bit
+ * @mm: mm_struct that is checked
+ * @page: enclave page to be tested for recent access
+ *
+ * Checks the Access (A) bit from the PTE corresponding to the enclave page and
+ * clears it.
+ *
+ * Return: 1 if the page has been recently accessed and 0 if not.
+ */
+int sgx_encl_test_and_clear_young(struct mm_struct *mm,
+ struct sgx_encl_page *page)
+{
+ unsigned long addr = page->desc & PAGE_MASK;
+ struct sgx_encl *encl = page->encl;
+ struct vm_area_struct *vma;
+ int ret;
+
+ ret = sgx_encl_find(mm, addr, &vma);
+ if (ret)
+ return 0;
+
+ if (encl != vma->vm_private_data)
+ return 0;
+
+ ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE,
+ sgx_encl_test_and_clear_young_cb, vma->vm_mm);
+ if (ret < 0)
+ return 0;
+
+ return ret;
+}
+
+/**
+ * sgx_alloc_va_page() - Allocate a Version Array (VA) page
+ *
+ * Allocate a free EPC page and convert it to a Version Array (VA) page.
+ *
+ * Return:
+ * a VA page,
+ * -errno otherwise
+ */
+struct sgx_epc_page *sgx_alloc_va_page(void)
+{
+ struct sgx_epc_page *epc_page;
+ int ret;
+
+ epc_page = sgx_alloc_epc_page(NULL, true);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
+
+ ret = __epa(sgx_get_epc_virt_addr(epc_page));
+ if (ret) {
+ WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
+ sgx_free_epc_page(epc_page);
+ return ERR_PTR(-EFAULT);
+ }
+
+ return epc_page;
+}
+
+/**
+ * sgx_alloc_va_slot - allocate a VA slot
+ * @va_page: a &struct sgx_va_page instance
+ *
+ * Allocates a slot from a &struct sgx_va_page instance.
+ *
+ * Return: offset of the slot inside the VA page
+ */
+unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page)
+{
+ int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);
+
+ if (slot < SGX_VA_SLOT_COUNT)
+ set_bit(slot, va_page->slots);
+
+ return slot << 3;
+}
+
+/**
+ * sgx_free_va_slot - free a VA slot
+ * @va_page: a &struct sgx_va_page instance
+ * @offset: offset of the slot inside the VA page
+ *
+ * Frees a slot from a &struct sgx_va_page instance.
+ */
+void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset)
+{
+ clear_bit(offset >> 3, va_page->slots);
+}
+
+/**
+ * sgx_va_page_full - is the VA page full?
+ * @va_page: a &struct sgx_va_page instance
+ *
+ * Return: true if all slots have been taken
+ */
+bool sgx_va_page_full(struct sgx_va_page *va_page)
+{
+ int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);
+
+ return slot == SGX_VA_SLOT_COUNT;
+}
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
new file mode 100644
index 000000000000..d8d30ccbef4c
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/**
+ * Copyright(c) 2016-20 Intel Corporation.
+ *
+ * Contains the software defined data structures for enclaves.
+ */
+#ifndef _X86_ENCL_H
+#define _X86_ENCL_H
+
+#include <linux/cpumask.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/mm_types.h>
+#include <linux/mmu_notifier.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/srcu.h>
+#include <linux/workqueue.h>
+#include <linux/xarray.h>
+#include "sgx.h"
+
+/* 'desc' bits holding the offset in the VA (version array) page. */
+#define SGX_ENCL_PAGE_VA_OFFSET_MASK GENMASK_ULL(11, 3)
+
+/* 'desc' bit marking that the page is being reclaimed. */
+#define SGX_ENCL_PAGE_BEING_RECLAIMED BIT(3)
+
+struct sgx_encl_page {
+ unsigned long desc;
+ unsigned long vm_max_prot_bits;
+ struct sgx_epc_page *epc_page;
+ struct sgx_encl *encl;
+ struct sgx_va_page *va_page;
+};
+
+enum sgx_encl_flags {
+ SGX_ENCL_IOCTL = BIT(0),
+ SGX_ENCL_DEBUG = BIT(1),
+ SGX_ENCL_CREATED = BIT(2),
+ SGX_ENCL_INITIALIZED = BIT(3),
+};
+
+struct sgx_encl_mm {
+ struct sgx_encl *encl;
+ struct mm_struct *mm;
+ struct list_head list;
+ struct mmu_notifier mmu_notifier;
+};
+
+struct sgx_encl {
+ unsigned long base;
+ unsigned long size;
+ unsigned long flags;
+ unsigned int page_cnt;
+ unsigned int secs_child_cnt;
+ struct mutex lock;
+ struct xarray page_array;
+ struct sgx_encl_page secs;
+ unsigned long attributes;
+ unsigned long attributes_mask;
+
+ cpumask_t cpumask;
+ struct file *backing;
+ struct kref refcount;
+ struct list_head va_pages;
+ unsigned long mm_list_version;
+ struct list_head mm_list;
+ spinlock_t mm_lock;
+ struct srcu_struct srcu;
+};
+
+#define SGX_VA_SLOT_COUNT 512
+
+struct sgx_va_page {
+ struct sgx_epc_page *epc_page;
+ DECLARE_BITMAP(slots, SGX_VA_SLOT_COUNT);
+ struct list_head list;
+};
+
+struct sgx_backing {
+ pgoff_t page_index;
+ struct page *contents;
+ struct page *pcmd;
+ unsigned long pcmd_offset;
+};
+
+extern const struct vm_operations_struct sgx_vm_ops;
+
+static inline int sgx_encl_find(struct mm_struct *mm, unsigned long addr,
+ struct vm_area_struct **vma)
+{
+ struct vm_area_struct *result;
+
+ result = find_vma(mm, addr);
+ if (!result || result->vm_ops != &sgx_vm_ops || addr < result->vm_start)
+ return -EINVAL;
+
+ *vma = result;
+
+ return 0;
+}
+
+int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
+ unsigned long end, unsigned long vm_flags);
+
+void sgx_encl_release(struct kref *ref);
+int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm);
+int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing);
+void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write);
+int sgx_encl_test_and_clear_young(struct mm_struct *mm,
+ struct sgx_encl_page *page);
+
+struct sgx_epc_page *sgx_alloc_va_page(void);
+unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page);
+void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset);
+bool sgx_va_page_full(struct sgx_va_page *va_page);
+
+#endif /* _X86_ENCL_H */
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
new file mode 100644
index 000000000000..443188fe7e70
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -0,0 +1,231 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_ENCLS_H
+#define _X86_ENCLS_H
+
+#include <linux/bitops.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/rwsem.h>
+#include <linux/types.h>
+#include <asm/asm.h>
+#include <asm/traps.h>
+#include "sgx.h"
+
+enum sgx_encls_function {
+ ECREATE = 0x00,
+ EADD = 0x01,
+ EINIT = 0x02,
+ EREMOVE = 0x03,
+ EDGBRD = 0x04,
+ EDGBWR = 0x05,
+ EEXTEND = 0x06,
+ ELDU = 0x08,
+ EBLOCK = 0x09,
+ EPA = 0x0A,
+ EWB = 0x0B,
+ ETRACK = 0x0C,
+};
+
+/**
+ * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
+ *
+ * ENCLS has its own (positive value) error codes and also generates
+ * ENCLS specific #GP and #PF faults. And the ENCLS values get munged
+ * with system error codes as everything percolates back up the stack.
+ * Unfortunately (for us), we need to precisely identify each unique
+ * error code, e.g. the action taken if EWB fails varies based on the
+ * type of fault and on the exact SGX error code, i.e. we can't simply
+ * convert all faults to -EFAULT.
+ *
+ * To make all three error types coexist, we set bit 30 to identify an
+ * ENCLS fault. Bit 31 (technically bits N:31) is used to differentiate
+ * between positive (faults and SGX error codes) and negative (system
+ * error codes) values.
+ */
+#define ENCLS_FAULT_FLAG 0x40000000
+
+/* Retrieve the encoded trapnr from the specified return code. */
+#define ENCLS_TRAPNR(r) ((r) & ~ENCLS_FAULT_FLAG)
+
+/* Issue a WARN() about an ENCLS function. */
+#define ENCLS_WARN(r, name) { \
+ do { \
+ int _r = (r); \
+ WARN_ONCE(_r, "%s returned %d (0x%x)\n", (name), _r, _r); \
+ } while (0); \
+}
+
+/**
+ * encls_failed() - Check if an ENCLS function failed
+ * @ret: the return value of an ENCLS function call
+ *
+ * Check if an ENCLS function failed. This happens when the function causes a
+ * fault that is not caused by an EPCM conflict or when the function returns a
+ * non-zero value.
+ */
+static inline bool encls_failed(int ret)
+{
+ if (ret & ENCLS_FAULT_FLAG)
+ return ENCLS_TRAPNR(ret) != X86_TRAP_PF;
+
+ return !!ret;
+}
+
+/**
+ * __encls_ret_N - encode an ENCLS function that returns an error code in EAX
+ * @rax: function number
+ * @inputs: asm inputs for the function
+ *
+ * Emit assembly for an ENCLS function that returns an error code, e.g. EREMOVE.
+ * And because SGX isn't complex enough as it is, function that return an error
+ * code also modify flags.
+ *
+ * Return:
+ * 0 on success,
+ * SGX error code on failure
+ */
+#define __encls_ret_N(rax, inputs...) \
+ ({ \
+ int ret; \
+ asm volatile( \
+ "1: .byte 0x0f, 0x01, 0xcf;\n\t" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE_FAULT(1b, 3b) \
+ : "=a"(ret) \
+ : "a"(rax), inputs \
+ : "memory", "cc"); \
+ ret; \
+ })
+
+#define __encls_ret_1(rax, rcx) \
+ ({ \
+ __encls_ret_N(rax, "c"(rcx)); \
+ })
+
+#define __encls_ret_2(rax, rbx, rcx) \
+ ({ \
+ __encls_ret_N(rax, "b"(rbx), "c"(rcx)); \
+ })
+
+#define __encls_ret_3(rax, rbx, rcx, rdx) \
+ ({ \
+ __encls_ret_N(rax, "b"(rbx), "c"(rcx), "d"(rdx)); \
+ })
+
+/**
+ * __encls_N - encode an ENCLS function that doesn't return an error code
+ * @rax: function number
+ * @rbx_out: optional output variable
+ * @inputs: asm inputs for the function
+ *
+ * Emit assembly for an ENCLS function that does not return an error code, e.g.
+ * ECREATE. Leaves without error codes either succeed or fault. @rbx_out is an
+ * optional parameter for use by EDGBRD, which returns the requested value in
+ * RBX.
+ *
+ * Return:
+ * 0 on success,
+ * trapnr with ENCLS_FAULT_FLAG set on fault
+ */
+#define __encls_N(rax, rbx_out, inputs...) \
+ ({ \
+ int ret; \
+ asm volatile( \
+ "1: .byte 0x0f, 0x01, 0xcf;\n\t" \
+ " xor %%eax,%%eax;\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: orl $"__stringify(ENCLS_FAULT_FLAG)",%%eax\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ _ASM_EXTABLE_FAULT(1b, 3b) \
+ : "=a"(ret), "=b"(rbx_out) \
+ : "a"(rax), inputs \
+ : "memory"); \
+ ret; \
+ })
+
+#define __encls_2(rax, rbx, rcx) \
+ ({ \
+ unsigned long ign_rbx_out; \
+ __encls_N(rax, ign_rbx_out, "b"(rbx), "c"(rcx)); \
+ })
+
+#define __encls_1_1(rax, data, rcx) \
+ ({ \
+ unsigned long rbx_out; \
+ int ret = __encls_N(rax, rbx_out, "c"(rcx)); \
+ if (!ret) \
+ data = rbx_out; \
+ ret; \
+ })
+
+static inline int __ecreate(struct sgx_pageinfo *pginfo, void *secs)
+{
+ return __encls_2(ECREATE, pginfo, secs);
+}
+
+static inline int __eextend(void *secs, void *addr)
+{
+ return __encls_2(EEXTEND, secs, addr);
+}
+
+static inline int __eadd(struct sgx_pageinfo *pginfo, void *addr)
+{
+ return __encls_2(EADD, pginfo, addr);
+}
+
+static inline int __einit(void *sigstruct, void *token, void *secs)
+{
+ return __encls_ret_3(EINIT, sigstruct, secs, token);
+}
+
+static inline int __eremove(void *addr)
+{
+ return __encls_ret_1(EREMOVE, addr);
+}
+
+static inline int __edbgwr(void *addr, unsigned long *data)
+{
+ return __encls_2(EDGBWR, *data, addr);
+}
+
+static inline int __edbgrd(void *addr, unsigned long *data)
+{
+ return __encls_1_1(EDGBRD, *data, addr);
+}
+
+static inline int __etrack(void *addr)
+{
+ return __encls_ret_1(ETRACK, addr);
+}
+
+static inline int __eldu(struct sgx_pageinfo *pginfo, void *addr,
+ void *va)
+{
+ return __encls_ret_3(ELDU, pginfo, addr, va);
+}
+
+static inline int __eblock(void *addr)
+{
+ return __encls_ret_1(EBLOCK, addr);
+}
+
+static inline int __epa(void *addr)
+{
+ unsigned long rbx = SGX_PAGE_TYPE_VA;
+
+ return __encls_2(EPA, rbx, addr);
+}
+
+static inline int __ewb(struct sgx_pageinfo *pginfo, void *addr,
+ void *va)
+{
+ return __encls_ret_3(EWB, pginfo, addr, va);
+}
+
+#endif /* _X86_ENCLS_H */
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
new file mode 100644
index 000000000000..90a5caf76939
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -0,0 +1,716 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <asm/mman.h>
+#include <linux/mman.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <linux/hashtable.h>
+#include <linux/highmem.h>
+#include <linux/ratelimit.h>
+#include <linux/sched/signal.h>
+#include <linux/shmem_fs.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include "driver.h"
+#include "encl.h"
+#include "encls.h"
+
+static struct sgx_va_page *sgx_encl_grow(struct sgx_encl *encl)
+{
+ struct sgx_va_page *va_page = NULL;
+ void *err;
+
+ BUILD_BUG_ON(SGX_VA_SLOT_COUNT !=
+ (SGX_ENCL_PAGE_VA_OFFSET_MASK >> 3) + 1);
+
+ if (!(encl->page_cnt % SGX_VA_SLOT_COUNT)) {
+ va_page = kzalloc(sizeof(*va_page), GFP_KERNEL);
+ if (!va_page)
+ return ERR_PTR(-ENOMEM);
+
+ va_page->epc_page = sgx_alloc_va_page();
+ if (IS_ERR(va_page->epc_page)) {
+ err = ERR_CAST(va_page->epc_page);
+ kfree(va_page);
+ return err;
+ }
+
+ WARN_ON_ONCE(encl->page_cnt % SGX_VA_SLOT_COUNT);
+ }
+ encl->page_cnt++;
+ return va_page;
+}
+
+static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page)
+{
+ encl->page_cnt--;
+
+ if (va_page) {
+ sgx_free_epc_page(va_page->epc_page);
+ list_del(&va_page->list);
+ kfree(va_page);
+ }
+}
+
+static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
+{
+ struct sgx_epc_page *secs_epc;
+ struct sgx_va_page *va_page;
+ struct sgx_pageinfo pginfo;
+ struct sgx_secinfo secinfo;
+ unsigned long encl_size;
+ struct file *backing;
+ long ret;
+
+ va_page = sgx_encl_grow(encl);
+ if (IS_ERR(va_page))
+ return PTR_ERR(va_page);
+ else if (va_page)
+ list_add(&va_page->list, &encl->va_pages);
+ /* else the tail page of the VA page list had free slots. */
+
+ /* The extra page goes to SECS. */
+ encl_size = secs->size + PAGE_SIZE;
+
+ backing = shmem_file_setup("SGX backing", encl_size + (encl_size >> 5),
+ VM_NORESERVE);
+ if (IS_ERR(backing)) {
+ ret = PTR_ERR(backing);
+ goto err_out_shrink;
+ }
+
+ encl->backing = backing;
+
+ secs_epc = sgx_alloc_epc_page(&encl->secs, true);
+ if (IS_ERR(secs_epc)) {
+ ret = PTR_ERR(secs_epc);
+ goto err_out_backing;
+ }
+
+ encl->secs.epc_page = secs_epc;
+
+ pginfo.addr = 0;
+ pginfo.contents = (unsigned long)secs;
+ pginfo.metadata = (unsigned long)&secinfo;
+ pginfo.secs = 0;
+ memset(&secinfo, 0, sizeof(secinfo));
+
+ ret = __ecreate((void *)&pginfo, sgx_get_epc_virt_addr(secs_epc));
+ if (ret) {
+ ret = -EIO;
+ goto err_out;
+ }
+
+ if (secs->attributes & SGX_ATTR_DEBUG)
+ set_bit(SGX_ENCL_DEBUG, &encl->flags);
+
+ encl->secs.encl = encl;
+ encl->base = secs->base;
+ encl->size = secs->size;
+ encl->attributes = secs->attributes;
+ encl->attributes_mask = SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | SGX_ATTR_KSS;
+
+ /* Set only after completion, as encl->lock has not been taken. */
+ set_bit(SGX_ENCL_CREATED, &encl->flags);
+
+ return 0;
+
+err_out:
+ sgx_free_epc_page(encl->secs.epc_page);
+ encl->secs.epc_page = NULL;
+
+err_out_backing:
+ fput(encl->backing);
+ encl->backing = NULL;
+
+err_out_shrink:
+ sgx_encl_shrink(encl, va_page);
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_create() - handler for %SGX_IOC_ENCLAVE_CREATE
+ * @encl: An enclave pointer.
+ * @arg: The ioctl argument.
+ *
+ * Allocate kernel data structures for the enclave and invoke ECREATE.
+ *
+ * Return:
+ * - 0: Success.
+ * - -EIO: ECREATE failed.
+ * - -errno: POSIX error.
+ */
+static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_enclave_create create_arg;
+ void *secs;
+ int ret;
+
+ if (test_bit(SGX_ENCL_CREATED, &encl->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&create_arg, arg, sizeof(create_arg)))
+ return -EFAULT;
+
+ secs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!secs)
+ return -ENOMEM;
+
+ if (copy_from_user(secs, (void __user *)create_arg.src, PAGE_SIZE))
+ ret = -EFAULT;
+ else
+ ret = sgx_encl_create(encl, secs);
+
+ kfree(secs);
+ return ret;
+}
+
+static struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
+ unsigned long offset,
+ u64 secinfo_flags)
+{
+ struct sgx_encl_page *encl_page;
+ unsigned long prot;
+
+ encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
+ if (!encl_page)
+ return ERR_PTR(-ENOMEM);
+
+ encl_page->desc = encl->base + offset;
+ encl_page->encl = encl;
+
+ prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) |
+ _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) |
+ _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC);
+
+ /*
+ * TCS pages must always RW set for CPU access while the SECINFO
+ * permissions are *always* zero - the CPU ignores the user provided
+ * values and silently overwrites them with zero permissions.
+ */
+ if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS)
+ prot |= PROT_READ | PROT_WRITE;
+
+ /* Calculate maximum of the VM flags for the page. */
+ encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);
+
+ return encl_page;
+}
+
+static int sgx_validate_secinfo(struct sgx_secinfo *secinfo)
+{
+ u64 perm = secinfo->flags & SGX_SECINFO_PERMISSION_MASK;
+ u64 pt = secinfo->flags & SGX_SECINFO_PAGE_TYPE_MASK;
+
+ if (pt != SGX_SECINFO_REG && pt != SGX_SECINFO_TCS)
+ return -EINVAL;
+
+ if ((perm & SGX_SECINFO_W) && !(perm & SGX_SECINFO_R))
+ return -EINVAL;
+
+ /*
+ * CPU will silently overwrite the permissions as zero, which means
+ * that we need to validate it ourselves.
+ */
+ if (pt == SGX_SECINFO_TCS && perm)
+ return -EINVAL;
+
+ if (secinfo->flags & SGX_SECINFO_RESERVED_MASK)
+ return -EINVAL;
+
+ if (memchr_inv(secinfo->reserved, 0, sizeof(secinfo->reserved)))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __sgx_encl_add_page(struct sgx_encl *encl,
+ struct sgx_encl_page *encl_page,
+ struct sgx_epc_page *epc_page,
+ struct sgx_secinfo *secinfo, unsigned long src)
+{
+ struct sgx_pageinfo pginfo;
+ struct vm_area_struct *vma;
+ struct page *src_page;
+ int ret;
+
+ /* Deny noexec. */
+ vma = find_vma(current->mm, src);
+ if (!vma)
+ return -EFAULT;
+
+ if (!(vma->vm_flags & VM_MAYEXEC))
+ return -EACCES;
+
+ ret = get_user_pages(src, 1, 0, &src_page, NULL);
+ if (ret < 1)
+ return -EFAULT;
+
+ pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
+ pginfo.addr = encl_page->desc & PAGE_MASK;
+ pginfo.metadata = (unsigned long)secinfo;
+ pginfo.contents = (unsigned long)kmap_atomic(src_page);
+
+ ret = __eadd(&pginfo, sgx_get_epc_virt_addr(epc_page));
+
+ kunmap_atomic((void *)pginfo.contents);
+ put_page(src_page);
+
+ return ret ? -EIO : 0;
+}
+
+/*
+ * If the caller requires measurement of the page as a proof for the content,
+ * use EEXTEND to add a measurement for 256 bytes of the page. Repeat this
+ * operation until the entire page is measured."
+ */
+static int __sgx_encl_extend(struct sgx_encl *encl,
+ struct sgx_epc_page *epc_page)
+{
+ unsigned long offset;
+ int ret;
+
+ for (offset = 0; offset < PAGE_SIZE; offset += SGX_EEXTEND_BLOCK_SIZE) {
+ ret = __eextend(sgx_get_epc_virt_addr(encl->secs.epc_page),
+ sgx_get_epc_virt_addr(epc_page) + offset);
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EEXTEND");
+
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+static int sgx_encl_add_page(struct sgx_encl *encl, unsigned long src,
+ unsigned long offset, struct sgx_secinfo *secinfo,
+ unsigned long flags)
+{
+ struct sgx_encl_page *encl_page;
+ struct sgx_epc_page *epc_page;
+ struct sgx_va_page *va_page;
+ int ret;
+
+ encl_page = sgx_encl_page_alloc(encl, offset, secinfo->flags);
+ if (IS_ERR(encl_page))
+ return PTR_ERR(encl_page);
+
+ epc_page = sgx_alloc_epc_page(encl_page, true);
+ if (IS_ERR(epc_page)) {
+ kfree(encl_page);
+ return PTR_ERR(epc_page);
+ }
+
+ va_page = sgx_encl_grow(encl);
+ if (IS_ERR(va_page)) {
+ ret = PTR_ERR(va_page);
+ goto err_out_free;
+ }
+
+ mmap_read_lock(current->mm);
+ mutex_lock(&encl->lock);
+
+ /*
+ * Adding to encl->va_pages must be done under encl->lock. Ditto for
+ * deleting (via sgx_encl_shrink()) in the error path.
+ */
+ if (va_page)
+ list_add(&va_page->list, &encl->va_pages);
+
+ /*
+ * Insert prior to EADD in case of OOM. EADD modifies MRENCLAVE, i.e.
+ * can't be gracefully unwound, while failure on EADD/EXTEND is limited
+ * to userspace errors (or kernel/hardware bugs).
+ */
+ ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
+ encl_page, GFP_KERNEL);
+ if (ret)
+ goto err_out_unlock;
+
+ ret = __sgx_encl_add_page(encl, encl_page, epc_page, secinfo,
+ src);
+ if (ret)
+ goto err_out;
+
+ /*
+ * Complete the "add" before doing the "extend" so that the "add"
+ * isn't in a half-baked state in the extremely unlikely scenario
+ * the enclave will be destroyed in response to EEXTEND failure.
+ */
+ encl_page->encl = encl;
+ encl_page->epc_page = epc_page;
+ encl->secs_child_cnt++;
+
+ if (flags & SGX_PAGE_MEASURE) {
+ ret = __sgx_encl_extend(encl, epc_page);
+ if (ret)
+ goto err_out;
+ }
+
+ sgx_mark_page_reclaimable(encl_page->epc_page);
+ mutex_unlock(&encl->lock);
+ mmap_read_unlock(current->mm);
+ return ret;
+
+err_out:
+ xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));
+
+err_out_unlock:
+ sgx_encl_shrink(encl, va_page);
+ mutex_unlock(&encl->lock);
+ mmap_read_unlock(current->mm);
+
+err_out_free:
+ sgx_free_epc_page(epc_page);
+ kfree(encl_page);
+
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_add_pages() - The handler for %SGX_IOC_ENCLAVE_ADD_PAGES
+ * @encl: an enclave pointer
+ * @arg: a user pointer to a struct sgx_enclave_add_pages instance
+ *
+ * Add one or more pages to an uninitialized enclave, and optionally extend the
+ * measurement with the contents of the page. The SECINFO and measurement mask
+ * are applied to all pages.
+ *
+ * A SECINFO for a TCS is required to always contain zero permissions because
+ * CPU silently zeros them. Allowing anything else would cause a mismatch in
+ * the measurement.
+ *
+ * mmap()'s protection bits are capped by the page permissions. For each page
+ * address, the maximum protection bits are computed with the following
+ * heuristics:
+ *
+ * 1. A regular page: PROT_R, PROT_W and PROT_X match the SECINFO permissions.
+ * 2. A TCS page: PROT_R | PROT_W.
+ *
+ * mmap() is not allowed to surpass the minimum of the maximum protection bits
+ * within the given address range.
+ *
+ * The function deinitializes kernel data structures for enclave and returns
+ * -EIO in any of the following conditions:
+ *
+ * - Enclave Page Cache (EPC), the physical memory holding enclaves, has
+ * been invalidated. This will cause EADD and EEXTEND to fail.
+ * - If the source address is corrupted somehow when executing EADD.
+ *
+ * Return:
+ * - 0: Success.
+ * - -EACCES: The source page is located in a noexec partition.
+ * - -ENOMEM: Out of EPC pages.
+ * - -EINTR: The call was interrupted before data was processed.
+ * - -EIO: Either EADD or EEXTEND failed because invalid source address
+ * or power cycle.
+ * - -errno: POSIX error.
+ */
+static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_enclave_add_pages add_arg;
+ struct sgx_secinfo secinfo;
+ unsigned long c;
+ int ret;
+
+ if (!test_bit(SGX_ENCL_CREATED, &encl->flags) ||
+ test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&add_arg, arg, sizeof(add_arg)))
+ return -EFAULT;
+
+ if (!IS_ALIGNED(add_arg.offset, PAGE_SIZE) ||
+ !IS_ALIGNED(add_arg.src, PAGE_SIZE))
+ return -EINVAL;
+
+ if (!add_arg.length || add_arg.length & (PAGE_SIZE - 1))
+ return -EINVAL;
+
+ if (add_arg.offset + add_arg.length - PAGE_SIZE >= encl->size)
+ return -EINVAL;
+
+ if (copy_from_user(&secinfo, (void __user *)add_arg.secinfo,
+ sizeof(secinfo)))
+ return -EFAULT;
+
+ if (sgx_validate_secinfo(&secinfo))
+ return -EINVAL;
+
+ for (c = 0 ; c < add_arg.length; c += PAGE_SIZE) {
+ if (signal_pending(current)) {
+ if (!c)
+ ret = -ERESTARTSYS;
+
+ break;
+ }
+
+ if (need_resched())
+ cond_resched();
+
+ ret = sgx_encl_add_page(encl, add_arg.src + c, add_arg.offset + c,
+ &secinfo, add_arg.flags);
+ if (ret)
+ break;
+ }
+
+ add_arg.count = c;
+
+ if (copy_to_user(arg, &add_arg, sizeof(add_arg)))
+ return -EFAULT;
+
+ return ret;
+}
+
+static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus,
+ void *hash)
+{
+ SHASH_DESC_ON_STACK(shash, tfm);
+
+ shash->tfm = tfm;
+
+ return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash);
+}
+
+static int sgx_get_key_hash(const void *modulus, void *hash)
+{
+ struct crypto_shash *tfm;
+ int ret;
+
+ tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ ret = __sgx_get_key_hash(tfm, modulus, hash);
+
+ crypto_free_shash(tfm);
+ return ret;
+}
+
+static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
+ void *token)
+{
+ u64 mrsigner[4];
+ int i, j, k;
+ void *addr;
+ int ret;
+
+ /*
+ * Deny initializing enclaves with attributes (namely provisioning)
+ * that have not been explicitly allowed.
+ */
+ if (encl->attributes & ~encl->attributes_mask)
+ return -EACCES;
+
+ /*
+ * Attributes should not be enforced *only* against what's available on
+ * platform (done in sgx_encl_create) but checked and enforced against
+ * the mask for enforcement in sigstruct. For example an enclave could
+ * opt to sign with AVX bit in xfrm, but still be loadable on a platform
+ * without it if the sigstruct->body.attributes_mask does not turn that
+ * bit on.
+ */
+ if (sigstruct->body.attributes & sigstruct->body.attributes_mask &
+ sgx_attributes_reserved_mask)
+ return -EINVAL;
+
+ if (sigstruct->body.miscselect & sigstruct->body.misc_mask &
+ sgx_misc_reserved_mask)
+ return -EINVAL;
+
+ if (sigstruct->body.xfrm & sigstruct->body.xfrm_mask &
+ sgx_xfrm_reserved_mask)
+ return -EINVAL;
+
+ ret = sgx_get_key_hash(sigstruct->modulus, mrsigner);
+ if (ret)
+ return ret;
+
+ mutex_lock(&encl->lock);
+
+ /*
+ * ENCLS[EINIT] is interruptible because it has such a high latency,
+ * e.g. 50k+ cycles on success. If an IRQ/NMI/SMI becomes pending,
+ * EINIT may fail with SGX_UNMASKED_EVENT so that the event can be
+ * serviced.
+ */
+ for (i = 0; i < SGX_EINIT_SLEEP_COUNT; i++) {
+ for (j = 0; j < SGX_EINIT_SPIN_COUNT; j++) {
+ addr = sgx_get_epc_virt_addr(encl->secs.epc_page);
+
+ preempt_disable();
+
+ for (k = 0; k < 4; k++)
+ wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]);
+
+ ret = __einit(sigstruct, token, addr);
+
+ preempt_enable();
+
+ if (ret == SGX_UNMASKED_EVENT)
+ continue;
+ else
+ break;
+ }
+
+ if (ret != SGX_UNMASKED_EVENT)
+ break;
+
+ msleep_interruptible(SGX_EINIT_SLEEP_TIME);
+
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ goto err_out;
+ }
+ }
+
+ if (ret & ENCLS_FAULT_FLAG) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EINIT");
+
+ ret = -EIO;
+ } else if (ret) {
+ pr_debug("EINIT returned %d\n", ret);
+ ret = -EPERM;
+ } else {
+ set_bit(SGX_ENCL_INITIALIZED, &encl->flags);
+ }
+
+err_out:
+ mutex_unlock(&encl->lock);
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_init() - handler for %SGX_IOC_ENCLAVE_INIT
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a struct sgx_enclave_init instance
+ *
+ * Flush any outstanding enqueued EADD operations and perform EINIT. The
+ * Launch Enclave Public Key Hash MSRs are rewritten as necessary to match
+ * the enclave's MRSIGNER, which is caculated from the provided sigstruct.
+ *
+ * Return:
+ * - 0: Success.
+ * - -EPERM: Invalid SIGSTRUCT.
+ * - -EIO: EINIT failed because of a power cycle.
+ * - -errno: POSIX error.
+ */
+static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_sigstruct *sigstruct;
+ struct sgx_enclave_init init_arg;
+ struct page *initp_page;
+ void *token;
+ int ret;
+
+ if (!test_bit(SGX_ENCL_CREATED, &encl->flags) ||
+ test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&init_arg, arg, sizeof(init_arg)))
+ return -EFAULT;
+
+ initp_page = alloc_page(GFP_KERNEL);
+ if (!initp_page)
+ return -ENOMEM;
+
+ sigstruct = kmap(initp_page);
+ token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2);
+ memset(token, 0, SGX_LAUNCH_TOKEN_SIZE);
+
+ if (copy_from_user(sigstruct, (void __user *)init_arg.sigstruct,
+ sizeof(*sigstruct))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * A legacy field used with Intel signed enclaves. These used to mean
+ * regular and architectural enclaves. The CPU only accepts these values
+ * but they do not have any other meaning.
+ *
+ * Thus, reject any other values.
+ */
+ if (sigstruct->header.vendor != 0x0000 &&
+ sigstruct->header.vendor != 0x8086) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = sgx_encl_init(encl, sigstruct, token);
+
+out:
+ kunmap(initp_page);
+ __free_page(initp_page);
+ return ret;
+}
+
+/**
+ * sgx_ioc_enclave_provision() - handler for %SGX_IOC_ENCLAVE_PROVISION
+ * @encl: an enclave pointer
+ * @arg: userspace pointer to a struct sgx_enclave_provision instance
+ *
+ * Allow ATTRIBUTE.PROVISION_KEY for an enclave by providing a file handle to
+ * /dev/sgx_provision.
+ *
+ * Return:
+ * - 0: Success.
+ * - -errno: Otherwise.
+ */
+static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg)
+{
+ struct sgx_enclave_provision params;
+ struct file *file;
+
+ if (copy_from_user(&params, arg, sizeof(params)))
+ return -EFAULT;
+
+ file = fget(params.fd);
+ if (!file)
+ return -EINVAL;
+
+ if (file->f_op != &sgx_provision_fops) {
+ fput(file);
+ return -EINVAL;
+ }
+
+ encl->attributes_mask |= SGX_ATTR_PROVISIONKEY;
+
+ fput(file);
+ return 0;
+}
+
+long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+ struct sgx_encl *encl = filep->private_data;
+ int ret;
+
+ if (test_and_set_bit(SGX_ENCL_IOCTL, &encl->flags))
+ return -EBUSY;
+
+ switch (cmd) {
+ case SGX_IOC_ENCLAVE_CREATE:
+ ret = sgx_ioc_enclave_create(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_ADD_PAGES:
+ ret = sgx_ioc_enclave_add_pages(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_INIT:
+ ret = sgx_ioc_enclave_init(encl, (void __user *)arg);
+ break;
+ case SGX_IOC_ENCLAVE_PROVISION:
+ ret = sgx_ioc_enclave_provision(encl, (void __user *)arg);
+ break;
+ default:
+ ret = -ENOIOCTLCMD;
+ break;
+ }
+
+ clear_bit(SGX_ENCL_IOCTL, &encl->flags);
+ return ret;
+}
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
new file mode 100644
index 000000000000..c519fc5f6948
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -0,0 +1,733 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2016-20 Intel Corporation. */
+
+#include <linux/freezer.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/ratelimit.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include "driver.h"
+#include "encl.h"
+#include "encls.h"
+
+struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
+static int sgx_nr_epc_sections;
+static struct task_struct *ksgxd_tsk;
+static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
+
+/*
+ * These variables are part of the state of the reclaimer, and must be accessed
+ * with sgx_reclaimer_lock acquired.
+ */
+static LIST_HEAD(sgx_active_page_list);
+
+static DEFINE_SPINLOCK(sgx_reclaimer_lock);
+
+/*
+ * Reset dirty EPC pages to uninitialized state. Laundry can be left with SECS
+ * pages whose child pages blocked EREMOVE.
+ */
+static void sgx_sanitize_section(struct sgx_epc_section *section)
+{
+ struct sgx_epc_page *page;
+ LIST_HEAD(dirty);
+ int ret;
+
+ /* init_laundry_list is thread-local, no need for a lock: */
+ while (!list_empty(&section->init_laundry_list)) {
+ if (kthread_should_stop())
+ return;
+
+ /* needed for access to ->page_list: */
+ spin_lock(&section->lock);
+
+ page = list_first_entry(&section->init_laundry_list,
+ struct sgx_epc_page, list);
+
+ ret = __eremove(sgx_get_epc_virt_addr(page));
+ if (!ret)
+ list_move(&page->list, &section->page_list);
+ else
+ list_move_tail(&page->list, &dirty);
+
+ spin_unlock(&section->lock);
+
+ cond_resched();
+ }
+
+ list_splice(&dirty, &section->init_laundry_list);
+}
+
+static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
+{
+ struct sgx_encl_page *page = epc_page->owner;
+ struct sgx_encl *encl = page->encl;
+ struct sgx_encl_mm *encl_mm;
+ bool ret = true;
+ int idx;
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ mmap_read_lock(encl_mm->mm);
+ ret = !sgx_encl_test_and_clear_young(encl_mm->mm, page);
+ mmap_read_unlock(encl_mm->mm);
+
+ mmput_async(encl_mm->mm);
+
+ if (!ret)
+ break;
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ if (!ret)
+ return false;
+
+ return true;
+}
+
+static void sgx_reclaimer_block(struct sgx_epc_page *epc_page)
+{
+ struct sgx_encl_page *page = epc_page->owner;
+ unsigned long addr = page->desc & PAGE_MASK;
+ struct sgx_encl *encl = page->encl;
+ unsigned long mm_list_version;
+ struct sgx_encl_mm *encl_mm;
+ struct vm_area_struct *vma;
+ int idx, ret;
+
+ do {
+ mm_list_version = encl->mm_list_version;
+
+ /* Pairs with smp_rmb() in sgx_encl_mm_add(). */
+ smp_rmb();
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ mmap_read_lock(encl_mm->mm);
+
+ ret = sgx_encl_find(encl_mm->mm, addr, &vma);
+ if (!ret && encl == vma->vm_private_data)
+ zap_vma_ptes(vma, addr, PAGE_SIZE);
+
+ mmap_read_unlock(encl_mm->mm);
+
+ mmput_async(encl_mm->mm);
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+ } while (unlikely(encl->mm_list_version != mm_list_version));
+
+ mutex_lock(&encl->lock);
+
+ ret = __eblock(sgx_get_epc_virt_addr(epc_page));
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EBLOCK");
+
+ mutex_unlock(&encl->lock);
+}
+
+static int __sgx_encl_ewb(struct sgx_epc_page *epc_page, void *va_slot,
+ struct sgx_backing *backing)
+{
+ struct sgx_pageinfo pginfo;
+ int ret;
+
+ pginfo.addr = 0;
+ pginfo.secs = 0;
+
+ pginfo.contents = (unsigned long)kmap_atomic(backing->contents);
+ pginfo.metadata = (unsigned long)kmap_atomic(backing->pcmd) +
+ backing->pcmd_offset;
+
+ ret = __ewb(&pginfo, sgx_get_epc_virt_addr(epc_page), va_slot);
+
+ kunmap_atomic((void *)(unsigned long)(pginfo.metadata -
+ backing->pcmd_offset));
+ kunmap_atomic((void *)(unsigned long)pginfo.contents);
+
+ return ret;
+}
+
+static void sgx_ipi_cb(void *info)
+{
+}
+
+static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl)
+{
+ cpumask_t *cpumask = &encl->cpumask;
+ struct sgx_encl_mm *encl_mm;
+ int idx;
+
+ /*
+ * Can race with sgx_encl_mm_add(), but ETRACK has already been
+ * executed, which means that the CPUs running in the new mm will enter
+ * into the enclave with a fresh epoch.
+ */
+ cpumask_clear(cpumask);
+
+ idx = srcu_read_lock(&encl->srcu);
+
+ list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
+ if (!mmget_not_zero(encl_mm->mm))
+ continue;
+
+ cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm));
+
+ mmput_async(encl_mm->mm);
+ }
+
+ srcu_read_unlock(&encl->srcu, idx);
+
+ return cpumask;
+}
+
+/*
+ * Swap page to the regular memory transformed to the blocked state by using
+ * EBLOCK, which means that it can no loger be referenced (no new TLB entries).
+ *
+ * The first trial just tries to write the page assuming that some other thread
+ * has reset the count for threads inside the enlave by using ETRACK, and
+ * previous thread count has been zeroed out. The second trial calls ETRACK
+ * before EWB. If that fails we kick all the HW threads out, and then do EWB,
+ * which should be guaranteed the succeed.
+ */
+static void sgx_encl_ewb(struct sgx_epc_page *epc_page,
+ struct sgx_backing *backing)
+{
+ struct sgx_encl_page *encl_page = epc_page->owner;
+ struct sgx_encl *encl = encl_page->encl;
+ struct sgx_va_page *va_page;
+ unsigned int va_offset;
+ void *va_slot;
+ int ret;
+
+ encl_page->desc &= ~SGX_ENCL_PAGE_BEING_RECLAIMED;
+
+ va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
+ list);
+ va_offset = sgx_alloc_va_slot(va_page);
+ va_slot = sgx_get_epc_virt_addr(va_page->epc_page) + va_offset;
+ if (sgx_va_page_full(va_page))
+ list_move_tail(&va_page->list, &encl->va_pages);
+
+ ret = __sgx_encl_ewb(epc_page, va_slot, backing);
+ if (ret == SGX_NOT_TRACKED) {
+ ret = __etrack(sgx_get_epc_virt_addr(encl->secs.epc_page));
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "ETRACK");
+ }
+
+ ret = __sgx_encl_ewb(epc_page, va_slot, backing);
+ if (ret == SGX_NOT_TRACKED) {
+ /*
+ * Slow path, send IPIs to kick cpus out of the
+ * enclave. Note, it's imperative that the cpu
+ * mask is generated *after* ETRACK, else we'll
+ * miss cpus that entered the enclave between
+ * generating the mask and incrementing epoch.
+ */
+ on_each_cpu_mask(sgx_encl_ewb_cpumask(encl),
+ sgx_ipi_cb, NULL, 1);
+ ret = __sgx_encl_ewb(epc_page, va_slot, backing);
+ }
+ }
+
+ if (ret) {
+ if (encls_failed(ret))
+ ENCLS_WARN(ret, "EWB");
+
+ sgx_free_va_slot(va_page, va_offset);
+ } else {
+ encl_page->desc |= va_offset;
+ encl_page->va_page = va_page;
+ }
+}
+
+static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
+ struct sgx_backing *backing)
+{
+ struct sgx_encl_page *encl_page = epc_page->owner;
+ struct sgx_encl *encl = encl_page->encl;
+ struct sgx_backing secs_backing;
+ int ret;
+
+ mutex_lock(&encl->lock);
+
+ sgx_encl_ewb(epc_page, backing);
+ encl_page->epc_page = NULL;
+ encl->secs_child_cnt--;
+
+ if (!encl->secs_child_cnt && test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) {
+ ret = sgx_encl_get_backing(encl, PFN_DOWN(encl->size),
+ &secs_backing);
+ if (ret)
+ goto out;
+
+ sgx_encl_ewb(encl->secs.epc_page, &secs_backing);
+
+ sgx_free_epc_page(encl->secs.epc_page);
+ encl->secs.epc_page = NULL;
+
+ sgx_encl_put_backing(&secs_backing, true);
+ }
+
+out:
+ mutex_unlock(&encl->lock);
+}
+
+/*
+ * Take a fixed number of pages from the head of the active page pool and
+ * reclaim them to the enclave's private shmem files. Skip the pages, which have
+ * been accessed since the last scan. Move those pages to the tail of active
+ * page pool so that the pages get scanned in LRU like fashion.
+ *
+ * Batch process a chunk of pages (at the moment 16) in order to degrade amount
+ * of IPI's and ETRACK's potentially required. sgx_encl_ewb() does degrade a bit
+ * among the HW threads with three stage EWB pipeline (EWB, ETRACK + EWB and IPI
+ * + EWB) but not sufficiently. Reclaiming one page at a time would also be
+ * problematic as it would increase the lock contention too much, which would
+ * halt forward progress.
+ */
+static void sgx_reclaim_pages(void)
+{
+ struct sgx_epc_page *chunk[SGX_NR_TO_SCAN];
+ struct sgx_backing backing[SGX_NR_TO_SCAN];
+ struct sgx_epc_section *section;
+ struct sgx_encl_page *encl_page;
+ struct sgx_epc_page *epc_page;
+ pgoff_t page_index;
+ int cnt = 0;
+ int ret;
+ int i;
+
+ spin_lock(&sgx_reclaimer_lock);
+ for (i = 0; i < SGX_NR_TO_SCAN; i++) {
+ if (list_empty(&sgx_active_page_list))
+ break;
+
+ epc_page = list_first_entry(&sgx_active_page_list,
+ struct sgx_epc_page, list);
+ list_del_init(&epc_page->list);
+ encl_page = epc_page->owner;
+
+ if (kref_get_unless_zero(&encl_page->encl->refcount) != 0)
+ chunk[cnt++] = epc_page;
+ else
+ /* The owner is freeing the page. No need to add the
+ * page back to the list of reclaimable pages.
+ */
+ epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
+ }
+ spin_unlock(&sgx_reclaimer_lock);
+
+ for (i = 0; i < cnt; i++) {
+ epc_page = chunk[i];
+ encl_page = epc_page->owner;
+
+ if (!sgx_reclaimer_age(epc_page))
+ goto skip;
+
+ page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
+ ret = sgx_encl_get_backing(encl_page->encl, page_index, &backing[i]);
+ if (ret)
+ goto skip;
+
+ mutex_lock(&encl_page->encl->lock);
+ encl_page->desc |= SGX_ENCL_PAGE_BEING_RECLAIMED;
+ mutex_unlock(&encl_page->encl->lock);
+ continue;
+
+skip:
+ spin_lock(&sgx_reclaimer_lock);
+ list_add_tail(&epc_page->list, &sgx_active_page_list);
+ spin_unlock(&sgx_reclaimer_lock);
+
+ kref_put(&encl_page->encl->refcount, sgx_encl_release);
+
+ chunk[i] = NULL;
+ }
+
+ for (i = 0; i < cnt; i++) {
+ epc_page = chunk[i];
+ if (epc_page)
+ sgx_reclaimer_block(epc_page);
+ }
+
+ for (i = 0; i < cnt; i++) {
+ epc_page = chunk[i];
+ if (!epc_page)
+ continue;
+
+ encl_page = epc_page->owner;
+ sgx_reclaimer_write(epc_page, &backing[i]);
+ sgx_encl_put_backing(&backing[i], true);
+
+ kref_put(&encl_page->encl->refcount, sgx_encl_release);
+ epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
+
+ section = &sgx_epc_sections[epc_page->section];
+ spin_lock(&section->lock);
+ list_add_tail(&epc_page->list, &section->page_list);
+ section->free_cnt++;
+ spin_unlock(&section->lock);
+ }
+}
+
+static unsigned long sgx_nr_free_pages(void)
+{
+ unsigned long cnt = 0;
+ int i;
+
+ for (i = 0; i < sgx_nr_epc_sections; i++)
+ cnt += sgx_epc_sections[i].free_cnt;
+
+ return cnt;
+}
+
+static bool sgx_should_reclaim(unsigned long watermark)
+{
+ return sgx_nr_free_pages() < watermark &&
+ !list_empty(&sgx_active_page_list);
+}
+
+static int ksgxd(void *p)
+{
+ int i;
+
+ set_freezable();
+
+ /*
+ * Sanitize pages in order to recover from kexec(). The 2nd pass is
+ * required for SECS pages, whose child pages blocked EREMOVE.
+ */
+ for (i = 0; i < sgx_nr_epc_sections; i++)
+ sgx_sanitize_section(&sgx_epc_sections[i]);
+
+ for (i = 0; i < sgx_nr_epc_sections; i++) {
+ sgx_sanitize_section(&sgx_epc_sections[i]);
+
+ /* Should never happen. */
+ if (!list_empty(&sgx_epc_sections[i].init_laundry_list))
+ WARN(1, "EPC section %d has unsanitized pages.\n", i);
+ }
+
+ while (!kthread_should_stop()) {
+ if (try_to_freeze())
+ continue;
+
+ wait_event_freezable(ksgxd_waitq,
+ kthread_should_stop() ||
+ sgx_should_reclaim(SGX_NR_HIGH_PAGES));
+
+ if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
+ sgx_reclaim_pages();
+
+ cond_resched();
+ }
+
+ return 0;
+}
+
+static bool __init sgx_page_reclaimer_init(void)
+{
+ struct task_struct *tsk;
+
+ tsk = kthread_run(ksgxd, NULL, "ksgxd");
+ if (IS_ERR(tsk))
+ return false;
+
+ ksgxd_tsk = tsk;
+
+ return true;
+}
+
+static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section)
+{
+ struct sgx_epc_page *page;
+
+ spin_lock(&section->lock);
+
+ if (list_empty(&section->page_list)) {
+ spin_unlock(&section->lock);
+ return NULL;
+ }
+
+ page = list_first_entry(&section->page_list, struct sgx_epc_page, list);
+ list_del_init(&page->list);
+ section->free_cnt--;
+
+ spin_unlock(&section->lock);
+ return page;
+}
+
+/**
+ * __sgx_alloc_epc_page() - Allocate an EPC page
+ *
+ * Iterate through EPC sections and borrow a free EPC page to the caller. When a
+ * page is no longer needed it must be released with sgx_free_epc_page().
+ *
+ * Return:
+ * an EPC page,
+ * -errno on error
+ */
+struct sgx_epc_page *__sgx_alloc_epc_page(void)
+{
+ struct sgx_epc_section *section;
+ struct sgx_epc_page *page;
+ int i;
+
+ for (i = 0; i < sgx_nr_epc_sections; i++) {
+ section = &sgx_epc_sections[i];
+
+ page = __sgx_alloc_epc_page_from_section(section);
+ if (page)
+ return page;
+ }
+
+ return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * sgx_mark_page_reclaimable() - Mark a page as reclaimable
+ * @page: EPC page
+ *
+ * Mark a page as reclaimable and add it to the active page list. Pages
+ * are automatically removed from the active list when freed.
+ */
+void sgx_mark_page_reclaimable(struct sgx_epc_page *page)
+{
+ spin_lock(&sgx_reclaimer_lock);
+ page->flags |= SGX_EPC_PAGE_RECLAIMER_TRACKED;
+ list_add_tail(&page->list, &sgx_active_page_list);
+ spin_unlock(&sgx_reclaimer_lock);
+}
+
+/**
+ * sgx_unmark_page_reclaimable() - Remove a page from the reclaim list
+ * @page: EPC page
+ *
+ * Clear the reclaimable flag and remove the page from the active page list.
+ *
+ * Return:
+ * 0 on success,
+ * -EBUSY if the page is in the process of being reclaimed
+ */
+int sgx_unmark_page_reclaimable(struct sgx_epc_page *page)
+{
+ spin_lock(&sgx_reclaimer_lock);
+ if (page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED) {
+ /* The page is being reclaimed. */
+ if (list_empty(&page->list)) {
+ spin_unlock(&sgx_reclaimer_lock);
+ return -EBUSY;
+ }
+
+ list_del(&page->list);
+ page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
+ }
+ spin_unlock(&sgx_reclaimer_lock);
+
+ return 0;
+}
+
+/**
+ * sgx_alloc_epc_page() - Allocate an EPC page
+ * @owner: the owner of the EPC page
+ * @reclaim: reclaim pages if necessary
+ *
+ * Iterate through EPC sections and borrow a free EPC page to the caller. When a
+ * page is no longer needed it must be released with sgx_free_epc_page(). If
+ * @reclaim is set to true, directly reclaim pages when we are out of pages. No
+ * mm's can be locked when @reclaim is set to true.
+ *
+ * Finally, wake up ksgxd when the number of pages goes below the watermark
+ * before returning back to the caller.
+ *
+ * Return:
+ * an EPC page,
+ * -errno on error
+ */
+struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
+{
+ struct sgx_epc_page *page;
+
+ for ( ; ; ) {
+ page = __sgx_alloc_epc_page();
+ if (!IS_ERR(page)) {
+ page->owner = owner;
+ break;
+ }
+
+ if (list_empty(&sgx_active_page_list))
+ return ERR_PTR(-ENOMEM);
+
+ if (!reclaim) {
+ page = ERR_PTR(-EBUSY);
+ break;
+ }
+
+ if (signal_pending(current)) {
+ page = ERR_PTR(-ERESTARTSYS);
+ break;
+ }
+
+ sgx_reclaim_pages();
+ cond_resched();
+ }
+
+ if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
+ wake_up(&ksgxd_waitq);
+
+ return page;
+}
+
+/**
+ * sgx_free_epc_page() - Free an EPC page
+ * @page: an EPC page
+ *
+ * Call EREMOVE for an EPC page and insert it back to the list of free pages.
+ */
+void sgx_free_epc_page(struct sgx_epc_page *page)
+{
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ int ret;
+
+ WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+
+ ret = __eremove(sgx_get_epc_virt_addr(page));
+ if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret))
+ return;
+
+ spin_lock(&section->lock);
+ list_add_tail(&page->list, &section->page_list);
+ section->free_cnt++;
+ spin_unlock(&section->lock);
+}
+
+static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
+ unsigned long index,
+ struct sgx_epc_section *section)
+{
+ unsigned long nr_pages = size >> PAGE_SHIFT;
+ unsigned long i;
+
+ section->virt_addr = memremap(phys_addr, size, MEMREMAP_WB);
+ if (!section->virt_addr)
+ return false;
+
+ section->pages = vmalloc(nr_pages * sizeof(struct sgx_epc_page));
+ if (!section->pages) {
+ memunmap(section->virt_addr);
+ return false;
+ }
+
+ section->phys_addr = phys_addr;
+ spin_lock_init(&section->lock);
+ INIT_LIST_HEAD(&section->page_list);
+ INIT_LIST_HEAD(&section->init_laundry_list);
+
+ for (i = 0; i < nr_pages; i++) {
+ section->pages[i].section = index;
+ section->pages[i].flags = 0;
+ section->pages[i].owner = NULL;
+ list_add_tail(&section->pages[i].list, &section->init_laundry_list);
+ }
+
+ section->free_cnt = nr_pages;
+ return true;
+}
+
+/**
+ * A section metric is concatenated in a way that @low bits 12-31 define the
+ * bits 12-31 of the metric and @high bits 0-19 define the bits 32-51 of the
+ * metric.
+ */
+static inline u64 __init sgx_calc_section_metric(u64 low, u64 high)
+{
+ return (low & GENMASK_ULL(31, 12)) +
+ ((high & GENMASK_ULL(19, 0)) << 32);
+}
+
+static bool __init sgx_page_cache_init(void)
+{
+ u32 eax, ebx, ecx, edx, type;
+ u64 pa, size;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
+ cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
+
+ type = eax & SGX_CPUID_EPC_MASK;
+ if (type == SGX_CPUID_EPC_INVALID)
+ break;
+
+ if (type != SGX_CPUID_EPC_SECTION) {
+ pr_err_once("Unknown EPC section type: %u\n", type);
+ break;
+ }
+
+ pa = sgx_calc_section_metric(eax, ebx);
+ size = sgx_calc_section_metric(ecx, edx);
+
+ pr_info("EPC section 0x%llx-0x%llx\n", pa, pa + size - 1);
+
+ if (!sgx_setup_epc_section(pa, size, i, &sgx_epc_sections[i])) {
+ pr_err("No free memory for an EPC section\n");
+ break;
+ }
+
+ sgx_nr_epc_sections++;
+ }
+
+ if (!sgx_nr_epc_sections) {
+ pr_err("There are zero EPC sections.\n");
+ return false;
+ }
+
+ return true;
+}
+
+static void __init sgx_init(void)
+{
+ int ret;
+ int i;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SGX))
+ return;
+
+ if (!sgx_page_cache_init())
+ return;
+
+ if (!sgx_page_reclaimer_init())
+ goto err_page_cache;
+
+ ret = sgx_drv_init();
+ if (ret)
+ goto err_kthread;
+
+ return;
+
+err_kthread:
+ kthread_stop(ksgxd_tsk);
+
+err_page_cache:
+ for (i = 0; i < sgx_nr_epc_sections; i++) {
+ vfree(sgx_epc_sections[i].pages);
+ memunmap(sgx_epc_sections[i].virt_addr);
+ }
+}
+
+device_initcall(sgx_init);
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
new file mode 100644
index 000000000000..5fa42d143feb
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _X86_SGX_H
+#define _X86_SGX_H
+
+#include <linux/bitops.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/rwsem.h>
+#include <linux/types.h>
+#include <asm/asm.h>
+#include "arch.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) "sgx: " fmt
+
+#define SGX_MAX_EPC_SECTIONS 8
+#define SGX_EEXTEND_BLOCK_SIZE 256
+#define SGX_NR_TO_SCAN 16
+#define SGX_NR_LOW_PAGES 32
+#define SGX_NR_HIGH_PAGES 64
+
+/* Pages, which are being tracked by the page reclaimer. */
+#define SGX_EPC_PAGE_RECLAIMER_TRACKED BIT(0)
+
+struct sgx_epc_page {
+ unsigned int section;
+ unsigned int flags;
+ struct sgx_encl_page *owner;
+ struct list_head list;
+};
+
+/*
+ * The firmware can define multiple chunks of EPC to the different areas of the
+ * physical memory e.g. for memory areas of the each node. This structure is
+ * used to store EPC pages for one EPC section and virtual memory area where
+ * the pages have been mapped.
+ *
+ * 'lock' must be held before accessing 'page_list' or 'free_cnt'.
+ */
+struct sgx_epc_section {
+ unsigned long phys_addr;
+ void *virt_addr;
+ struct sgx_epc_page *pages;
+
+ spinlock_t lock;
+ struct list_head page_list;
+ unsigned long free_cnt;
+
+ /*
+ * Pages which need EREMOVE run on them before they can be
+ * used. Only safe to be accessed in ksgxd and init code.
+ * Not protected by locks.
+ */
+ struct list_head init_laundry_list;
+};
+
+extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
+
+static inline unsigned long sgx_get_epc_phys_addr(struct sgx_epc_page *page)
+{
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ unsigned long index;
+
+ index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page);
+
+ return section->phys_addr + index * PAGE_SIZE;
+}
+
+static inline void *sgx_get_epc_virt_addr(struct sgx_epc_page *page)
+{
+ struct sgx_epc_section *section = &sgx_epc_sections[page->section];
+ unsigned long index;
+
+ index = ((unsigned long)page - (unsigned long)section->pages) / sizeof(*page);
+
+ return section->virt_addr + index * PAGE_SIZE;
+}
+
+struct sgx_epc_page *__sgx_alloc_epc_page(void);
+void sgx_free_epc_page(struct sgx_epc_page *page);
+
+void sgx_mark_page_reclaimable(struct sgx_epc_page *page);
+int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
+struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
+
+#endif /* _X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index d3a0791bc052..1068002c8532 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -96,6 +96,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c)
unsigned int ht_mask_width, core_plus_mask_width, die_plus_mask_width;
unsigned int core_select_mask, core_level_siblings;
unsigned int die_select_mask, die_level_siblings;
+ bool die_level_present = false;
int leaf;
leaf = detect_extended_topology_leaf(c);
@@ -126,6 +127,7 @@ int detect_extended_topology(struct cpuinfo_x86 *c)
die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
}
if (LEAFB_SUBTYPE(ecx) == DIE_TYPE) {
+ die_level_present = true;
die_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
die_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
}
@@ -139,8 +141,12 @@ int detect_extended_topology(struct cpuinfo_x86 *c)
c->cpu_core_id = apic->phys_pkg_id(c->initial_apicid,
ht_mask_width) & core_select_mask;
- c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid,
- core_plus_mask_width) & die_select_mask;
+
+ if (die_level_present) {
+ c->cpu_die_id = apic->phys_pkg_id(c->initial_apicid,
+ core_plus_mask_width) & die_select_mask;
+ }
+
c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid,
die_plus_mask_width);
/*
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 924571fe5864..c6ede3b3d302 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -501,12 +501,12 @@ static bool vmware_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
ghcb_rbp_is_valid(ghcb)))
return false;
- regs->bx = ghcb->save.rbx;
- regs->cx = ghcb->save.rcx;
- regs->dx = ghcb->save.rdx;
- regs->si = ghcb->save.rsi;
- regs->di = ghcb->save.rdi;
- regs->bp = ghcb->save.rbp;
+ regs->bx = ghcb_get_rbx(ghcb);
+ regs->cx = ghcb_get_rcx(ghcb);
+ regs->dx = ghcb_get_rdx(ghcb);
+ regs->si = ghcb_get_rsi(ghcb);
+ regs->di = ghcb_get_rdi(ghcb);
+ regs->bp = ghcb_get_rbp(ghcb);
return true;
}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 3492aa36bf09..6f7b8cc1bc9f 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -74,10 +74,9 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
init_completion(&cmd.done);
for (; count; count -= 16) {
- call_single_data_t csd = {
- .func = cpuid_smp_cpuid,
- .info = &cmd,
- };
+ call_single_data_t csd;
+
+ INIT_CSD(&csd, cpuid_smp_cpuid, &cmd);
cmd.regs.eax = pos;
cmd.regs.ecx = pos >> 32;
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 33ee47670b99..5fcac46aaf6b 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -13,8 +13,6 @@
#include <linux/uaccess.h>
-static void *kdump_buf_page;
-
static inline bool is_crashed_pfn_valid(unsigned long pfn)
{
#ifndef CONFIG_X86_PAE
@@ -41,15 +39,11 @@ static inline bool is_crashed_pfn_valid(unsigned long pfn)
* @userbuf: if set, @buf is in user address space, use copy_to_user(),
* otherwise @buf is in kernel address space, use memcpy().
*
- * Copy a page from "oldmem". For this page, there is no pte mapped
- * in the current kernel. We stitch up a pte, similar to kmap_atomic.
- *
- * Calling copy_to_user() in atomic context is not desirable. Hence first
- * copying the data to a pre-allocated kernel page and then copying to user
- * space in non-atomic context.
+ * Copy a page from "oldmem". For this page, there might be no pte mapped
+ * in the current kernel.
*/
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
- size_t csize, unsigned long offset, int userbuf)
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
+ unsigned long offset, int userbuf)
{
void *vaddr;
@@ -59,38 +53,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
if (!is_crashed_pfn_valid(pfn))
return -EFAULT;
- vaddr = kmap_atomic_pfn(pfn);
+ vaddr = kmap_local_pfn(pfn);
if (!userbuf) {
- memcpy(buf, (vaddr + offset), csize);
- kunmap_atomic(vaddr);
+ memcpy(buf, vaddr + offset, csize);
} else {
- if (!kdump_buf_page) {
- printk(KERN_WARNING "Kdump: Kdump buffer page not"
- " allocated\n");
- kunmap_atomic(vaddr);
- return -EFAULT;
- }
- copy_page(kdump_buf_page, vaddr);
- kunmap_atomic(vaddr);
- if (copy_to_user(buf, (kdump_buf_page + offset), csize))
- return -EFAULT;
+ if (copy_to_user(buf, vaddr + offset, csize))
+ csize = -EFAULT;
}
- return csize;
-}
+ kunmap_local(vaddr);
-static int __init kdump_buf_page_init(void)
-{
- int ret = 0;
-
- kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!kdump_buf_page) {
- printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer"
- " page\n");
- ret = -ENOMEM;
- }
-
- return ret;
+ return csize;
}
-arch_initcall(kdump_buf_page_init);
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index ddffd80f5c52..6a4cb71c2498 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -184,31 +184,31 @@ static unsigned int ioapic_id;
struct of_ioapic_type {
u32 out_type;
- u32 trigger;
- u32 polarity;
+ u32 is_level;
+ u32 active_low;
};
static struct of_ioapic_type of_ioapic_type[] =
{
{
- .out_type = IRQ_TYPE_EDGE_RISING,
- .trigger = IOAPIC_EDGE,
- .polarity = 1,
+ .out_type = IRQ_TYPE_EDGE_FALLING,
+ .is_level = 0,
+ .active_low = 1,
},
{
- .out_type = IRQ_TYPE_LEVEL_LOW,
- .trigger = IOAPIC_LEVEL,
- .polarity = 0,
+ .out_type = IRQ_TYPE_LEVEL_HIGH,
+ .is_level = 1,
+ .active_low = 0,
},
{
- .out_type = IRQ_TYPE_LEVEL_HIGH,
- .trigger = IOAPIC_LEVEL,
- .polarity = 1,
+ .out_type = IRQ_TYPE_LEVEL_LOW,
+ .is_level = 1,
+ .active_low = 1,
},
{
- .out_type = IRQ_TYPE_EDGE_FALLING,
- .trigger = IOAPIC_EDGE,
- .polarity = 0,
+ .out_type = IRQ_TYPE_EDGE_RISING,
+ .is_level = 0,
+ .active_low = 0,
},
};
@@ -228,7 +228,7 @@ static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
return -EINVAL;
it = &of_ioapic_type[type_index];
- ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity);
+ ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->is_level, it->active_low);
tmp.devid = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain));
tmp.ioapic.pin = fwspec->param[0];
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 97aa900386cb..299c20f0a38b 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -183,7 +183,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
}
}
-void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, const char *log_lvl)
{
struct unwind_state state;
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index ac3d5f22fe64..0d54099c2a3a 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -140,16 +140,27 @@ SYM_FUNC_START(ftrace_caller)
/* save_mcount_regs fills in first two parameters */
save_mcount_regs
+ /* Stack - skipping return address of ftrace_caller */
+ leaq MCOUNT_REG_SIZE+8(%rsp), %rcx
+ movq %rcx, RSP(%rsp)
+
SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL)
/* Load the ftrace_ops into the 3rd parameter */
movq function_trace_op(%rip), %rdx
- /* regs go into 4th parameter (but make it NULL) */
- movq $0, %rcx
+ /* regs go into 4th parameter */
+ leaq (%rsp), %rcx
+
+ /* Only ops with REGS flag set should have CS register set */
+ movq $0, CS(%rsp)
SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
call ftrace_stub
+ /* Handlers can change the RIP */
+ movq RIP(%rsp), %rax
+ movq %rax, MCOUNT_REG_SIZE(%rsp)
+
restore_mcount_regs
/*
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 05e117137b45..5e9beb77cafd 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -37,7 +37,6 @@
#include <asm/kasan.h>
#include <asm/fixmap.h>
#include <asm/realmode.h>
-#include <asm/desc.h>
#include <asm/extable.h>
#include <asm/trapnr.h>
#include <asm/sev-es.h>
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3c417734790f..04bddaaba8e2 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -26,15 +26,6 @@
#include <asm/nospec-branch.h>
#include <asm/fixmap.h>
-#ifdef CONFIG_PARAVIRT_XXL
-#include <asm/asm-offsets.h>
-#include <asm/paravirt.h>
-#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg
-#else
-#define INTERRUPT_RETURN iretq
-#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg
-#endif
-
/*
* We are not able to switch in one step to the final KERNEL ADDRESS SPACE
* because we need identity-mapped pages.
@@ -540,21 +531,19 @@ SYM_DATA_END(level3_kernel_pgt)
SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt)
/*
- * 512 MB kernel mapping. We spend a full page on this pagetable
- * anyway.
+ * Kernel high mapping.
*
- * The kernel code+data+bss must not be bigger than that.
+ * The kernel code+data+bss must be located below KERNEL_IMAGE_SIZE in
+ * virtual address space, which is 1 GiB if RANDOMIZE_BASE is enabled,
+ * 512 MiB otherwise.
*
- * (NOTE: at +512MB starts the module area, see MODULES_VADDR.
- * If you want to increase this then increase MODULES_VADDR
- * too.)
+ * (NOTE: after that starts the module area, see MODULES_VADDR.)
*
- * This table is eventually used by the kernel during normal
- * runtime. Care must be taken to clear out undesired bits
- * later, like _PAGE_RW or _PAGE_GLOBAL in some cases.
+ * This table is eventually used by the kernel during normal runtime.
+ * Care must be taken to clear out undesired bits later, like _PAGE_RW
+ * or _PAGE_GLOBAL in some cases.
*/
- PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
- KERNEL_IMAGE_SIZE/PMD_SIZE)
+ PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE)
SYM_DATA_END(level2_kernel_pgt)
SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 7a50f0b62a70..08651a4e6aa0 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -7,6 +7,7 @@
#include <linux/cpu.h>
#include <linux/irq.h>
+#include <asm/irq_remapping.h>
#include <asm/hpet.h>
#include <asm/time.h>
@@ -50,7 +51,7 @@ unsigned long hpet_address;
u8 hpet_blockid; /* OS timer block num */
bool hpet_msi_disable;
-#ifdef CONFIG_PCI_MSI
+#ifdef CONFIG_GENERIC_MSI_IRQ
static DEFINE_PER_CPU(struct hpet_channel *, cpu_hpet_channel);
static struct irq_domain *hpet_domain;
#endif
@@ -467,9 +468,8 @@ static void __init hpet_legacy_clockevent_register(struct hpet_channel *hc)
/*
* HPET MSI Support
*/
-#ifdef CONFIG_PCI_MSI
-
-void hpet_msi_unmask(struct irq_data *data)
+#ifdef CONFIG_GENERIC_MSI_IRQ
+static void hpet_msi_unmask(struct irq_data *data)
{
struct hpet_channel *hc = irq_data_get_irq_handler_data(data);
unsigned int cfg;
@@ -479,7 +479,7 @@ void hpet_msi_unmask(struct irq_data *data)
hpet_writel(cfg, HPET_Tn_CFG(hc->num));
}
-void hpet_msi_mask(struct irq_data *data)
+static void hpet_msi_mask(struct irq_data *data)
{
struct hpet_channel *hc = irq_data_get_irq_handler_data(data);
unsigned int cfg;
@@ -489,12 +489,122 @@ void hpet_msi_mask(struct irq_data *data)
hpet_writel(cfg, HPET_Tn_CFG(hc->num));
}
-void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg)
+static void hpet_msi_write(struct hpet_channel *hc, struct msi_msg *msg)
{
hpet_writel(msg->data, HPET_Tn_ROUTE(hc->num));
hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hc->num) + 4);
}
+static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ hpet_msi_write(irq_data_get_irq_handler_data(data), msg);
+}
+
+static struct irq_chip hpet_msi_controller __ro_after_init = {
+ .name = "HPET-MSI",
+ .irq_unmask = hpet_msi_unmask,
+ .irq_mask = hpet_msi_mask,
+ .irq_ack = irq_chip_ack_parent,
+ .irq_set_affinity = msi_domain_set_affinity,
+ .irq_retrigger = irq_chip_retrigger_hierarchy,
+ .irq_write_msi_msg = hpet_msi_write_msg,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+static int hpet_msi_init(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq,
+ irq_hw_number_t hwirq, msi_alloc_info_t *arg)
+{
+ irq_set_status_flags(virq, IRQ_MOVE_PCNTXT);
+ irq_domain_set_info(domain, virq, arg->hwirq, info->chip, NULL,
+ handle_edge_irq, arg->data, "edge");
+
+ return 0;
+}
+
+static void hpet_msi_free(struct irq_domain *domain,
+ struct msi_domain_info *info, unsigned int virq)
+{
+ irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT);
+}
+
+static struct msi_domain_ops hpet_msi_domain_ops = {
+ .msi_init = hpet_msi_init,
+ .msi_free = hpet_msi_free,
+};
+
+static struct msi_domain_info hpet_msi_domain_info = {
+ .ops = &hpet_msi_domain_ops,
+ .chip = &hpet_msi_controller,
+ .flags = MSI_FLAG_USE_DEF_DOM_OPS,
+};
+
+static struct irq_domain *hpet_create_irq_domain(int hpet_id)
+{
+ struct msi_domain_info *domain_info;
+ struct irq_domain *parent, *d;
+ struct fwnode_handle *fn;
+ struct irq_fwspec fwspec;
+
+ if (x86_vector_domain == NULL)
+ return NULL;
+
+ domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL);
+ if (!domain_info)
+ return NULL;
+
+ *domain_info = hpet_msi_domain_info;
+ domain_info->data = (void *)(long)hpet_id;
+
+ fn = irq_domain_alloc_named_id_fwnode(hpet_msi_controller.name,
+ hpet_id);
+ if (!fn) {
+ kfree(domain_info);
+ return NULL;
+ }
+
+ fwspec.fwnode = fn;
+ fwspec.param_count = 1;
+ fwspec.param[0] = hpet_id;
+
+ parent = irq_find_matching_fwspec(&fwspec, DOMAIN_BUS_ANY);
+ if (!parent) {
+ irq_domain_free_fwnode(fn);
+ kfree(domain_info);
+ return NULL;
+ }
+ if (parent != x86_vector_domain)
+ hpet_msi_controller.name = "IR-HPET-MSI";
+
+ d = msi_create_irq_domain(fn, domain_info, parent);
+ if (!d) {
+ irq_domain_free_fwnode(fn);
+ kfree(domain_info);
+ }
+ return d;
+}
+
+static inline int hpet_dev_id(struct irq_domain *domain)
+{
+ struct msi_domain_info *info = msi_get_domain_info(domain);
+
+ return (int)(long)info->data;
+}
+
+static int hpet_assign_irq(struct irq_domain *domain, struct hpet_channel *hc,
+ int dev_num)
+{
+ struct irq_alloc_info info;
+
+ init_irq_alloc_info(&info, NULL);
+ info.type = X86_IRQ_ALLOC_TYPE_HPET;
+ info.data = hc;
+ info.devid = hpet_dev_id(domain);
+ info.hwirq = dev_num;
+
+ return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info);
+}
+
static int hpet_clkevt_msi_resume(struct clock_event_device *evt)
{
struct hpet_channel *hc = clockevent_to_channel(evt);
diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c
deleted file mode 100644
index 7dfb1e808928..000000000000
--- a/arch/x86/kernel/ima_arch.c
+++ /dev/null
@@ -1,94 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0+ */
-/*
- * Copyright (C) 2018 IBM Corporation
- */
-#include <linux/efi.h>
-#include <linux/module.h>
-#include <linux/ima.h>
-
-extern struct boot_params boot_params;
-
-static enum efi_secureboot_mode get_sb_mode(void)
-{
- efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID;
- efi_status_t status;
- unsigned long size;
- u8 secboot, setupmode;
-
- size = sizeof(secboot);
-
- if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE)) {
- pr_info("ima: secureboot mode unknown, no efi\n");
- return efi_secureboot_mode_unknown;
- }
-
- /* Get variable contents into buffer */
- status = efi.get_variable(L"SecureBoot", &efi_variable_guid,
- NULL, &size, &secboot);
- if (status == EFI_NOT_FOUND) {
- pr_info("ima: secureboot mode disabled\n");
- return efi_secureboot_mode_disabled;
- }
-
- if (status != EFI_SUCCESS) {
- pr_info("ima: secureboot mode unknown\n");
- return efi_secureboot_mode_unknown;
- }
-
- size = sizeof(setupmode);
- status = efi.get_variable(L"SetupMode", &efi_variable_guid,
- NULL, &size, &setupmode);
-
- if (status != EFI_SUCCESS) /* ignore unknown SetupMode */
- setupmode = 0;
-
- if (secboot == 0 || setupmode == 1) {
- pr_info("ima: secureboot mode disabled\n");
- return efi_secureboot_mode_disabled;
- }
-
- pr_info("ima: secureboot mode enabled\n");
- return efi_secureboot_mode_enabled;
-}
-
-bool arch_ima_get_secureboot(void)
-{
- static enum efi_secureboot_mode sb_mode;
- static bool initialized;
-
- if (!initialized && efi_enabled(EFI_BOOT)) {
- sb_mode = boot_params.secure_boot;
-
- if (sb_mode == efi_secureboot_mode_unset)
- sb_mode = get_sb_mode();
- initialized = true;
- }
-
- if (sb_mode == efi_secureboot_mode_enabled)
- return true;
- else
- return false;
-}
-
-/* secureboot arch rules */
-static const char * const sb_arch_rules[] = {
-#if !IS_ENABLED(CONFIG_KEXEC_SIG)
- "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig",
-#endif /* CONFIG_KEXEC_SIG */
- "measure func=KEXEC_KERNEL_CHECK",
-#if !IS_ENABLED(CONFIG_MODULE_SIG)
- "appraise func=MODULE_CHECK appraise_type=imasig",
-#endif
- "measure func=MODULE_CHECK",
- NULL
-};
-
-const char * const *arch_get_ima_policy(void)
-{
- if (IS_ENABLED(CONFIG_IMA_ARCH_POLICY) && arch_ima_get_secureboot()) {
- if (IS_ENABLED(CONFIG_MODULE_SIG))
- set_module_sig_enforced();
- return sb_arch_rules;
- }
- return NULL;
-}
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 547c7abb39f5..a65e9e97857f 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -864,6 +864,7 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs,
p->ainsn.boostable = true;
goto no_change;
}
+ break;
default:
break;
}
@@ -937,6 +938,11 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
* So clear it by resetting the current kprobe:
*/
regs->flags &= ~X86_EFLAGS_TF;
+ /*
+ * Since the single step (trap) has been cancelled,
+ * we need to restore BTF here.
+ */
+ restore_btf();
/*
* If the TF flag was set before the kprobe hit,
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 681a4b36e9bb..373e5fa3ce1f 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -14,15 +14,21 @@
/* Ftrace callback handler for kprobes -- called under preepmt disabed */
void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct pt_regs *regs)
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
+ struct pt_regs *regs = ftrace_get_regs(fregs);
struct kprobe *p;
struct kprobe_ctlblk *kcb;
+ int bit;
- /* Preempt is disabled by ftrace */
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0)
+ return;
+
+ preempt_disable_notrace();
p = get_kprobe((kprobe_opcode_t *)ip);
if (unlikely(!p) || kprobe_disabled(p))
- return;
+ goto out;
kcb = get_kprobe_ctlblk();
if (kprobe_running()) {
@@ -52,6 +58,9 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
*/
__this_cpu_write(current_kprobe, NULL);
}
+out:
+ preempt_enable_notrace();
+ ftrace_test_recursion_unlock(bit);
}
NOKPROBE_SYMBOL(kprobe_ftrace_handler);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 7f57ede3cb8e..5e78e01ca3b4 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -740,6 +740,11 @@ static void __init kvm_apic_init(void)
#endif
}
+static bool __init kvm_msi_ext_dest_id(void)
+{
+ return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID);
+}
+
static void __init kvm_init_platform(void)
{
kvmclock_init();
@@ -769,6 +774,7 @@ const __initconst struct hypervisor_x86 x86_hyper_kvm = {
.type = X86_HYPER_KVM,
.init.guest_late_init = kvm_guest_init,
.init.x2apic_available = kvm_para_available,
+ .init.msi_ext_dest_id = kvm_msi_ext_dest_id,
.init.init_platform = kvm_init_platform,
#if defined(CONFIG_AMD_MEM_ENCRYPT)
.runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare,
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 34b18f6eeb2c..aa593743acf6 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -44,7 +44,6 @@ static int __init parse_no_kvmclock_vsyscall(char *arg)
early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
/* Aligned to page sizes to match whats mapped via vsyscalls to userspace */
-#define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS)
#define HVC_BOOT_ARRAY_SIZE \
(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index c0d409810658..8a67d1fa8dc5 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -99,11 +99,9 @@ static int filter_write(u32 reg)
if (!__ratelimit(&fw_rs))
return 0;
- if (reg == MSR_IA32_ENERGY_PERF_BIAS)
- return 0;
-
- pr_err("Write to unrecognized MSR 0x%x by %s (pid: %d). Please report to [email protected].\n",
- reg, current->comm, current->pid);
+ pr_warn("Write to unrecognized MSR 0x%x by %s (pid: %d).\n",
+ reg, current->comm, current->pid);
+ pr_warn("See https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/about for details.\n");
return 0;
}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 4bc77aaf1303..bf250a339655 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -475,7 +475,7 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7);
DEFINE_IDTENTRY_RAW(exc_nmi)
{
- bool irq_state;
+ irqentry_state_t irq_state;
/*
* Re-enable NMIs right here when running as an SEV-ES guest. This might
@@ -502,14 +502,14 @@ nmi_restart:
this_cpu_write(nmi_dr7, local_db_save());
- irq_state = idtentry_enter_nmi(regs);
+ irq_state = irqentry_nmi_enter(regs);
inc_irq_stat(__nmi_count);
if (!ignore_nmis)
default_do_nmi(regs);
- idtentry_exit_nmi(regs, irq_state);
+ irqentry_nmi_exit(regs, irq_state);
local_db_restore(this_cpu_read(nmi_dr7));
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index f9e5352b3bef..624703af80a1 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -122,7 +122,7 @@ int perf_reg_validate(u64 mask)
u64 perf_reg_abi(struct task_struct *task)
{
- if (test_tsk_thread_flag(task, TIF_IA32))
+ if (!user_64bit_mode(task_pt_regs(task)))
return PERF_SAMPLE_REGS_ABI_32;
else
return PERF_SAMPLE_REGS_ABI_64;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index df342bedea88..ad582f9ac5a6 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -511,11 +511,10 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
EXPORT_SYMBOL_GPL(start_thread);
#ifdef CONFIG_COMPAT
-void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32)
{
start_thread_common(regs, new_ip, new_sp,
- test_thread_flag(TIF_X32)
- ? __USER_CS : __USER32_CS,
+ x32 ? __USER_CS : __USER32_CS,
__USER_DS, __USER_DS);
}
#endif
@@ -641,16 +640,12 @@ void set_personality_64bit(void)
/* inherit personality from parent */
/* Make sure to be in 64bit mode */
- clear_thread_flag(TIF_IA32);
clear_thread_flag(TIF_ADDR32);
- clear_thread_flag(TIF_X32);
/* Pretend that this comes from a 64bit execve */
task_pt_regs(current)->orig_ax = __NR_execve;
current_thread_info()->status &= ~TS_COMPAT;
-
- /* Ensure the corresponding mm is not marked. */
if (current->mm)
- current->mm->context.ia32_compat = 0;
+ current->mm->context.flags = MM_CONTEXT_HAS_VSYSCALL;
/* TBD: overwrites user setup. Should have two bits.
But 64bit processes have always behaved this way,
@@ -662,10 +657,9 @@ void set_personality_64bit(void)
static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32
- clear_thread_flag(TIF_IA32);
- set_thread_flag(TIF_X32);
if (current->mm)
- current->mm->context.ia32_compat = TIF_X32;
+ current->mm->context.flags = 0;
+
current->personality &= ~READ_IMPLIES_EXEC;
/*
* in_32bit_syscall() uses the presence of the x32 syscall bit
@@ -683,10 +677,14 @@ static void __set_personality_x32(void)
static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
- set_thread_flag(TIF_IA32);
- clear_thread_flag(TIF_X32);
- if (current->mm)
- current->mm->context.ia32_compat = TIF_IA32;
+ if (current->mm) {
+ /*
+ * uprobes applied to this MM need to know this and
+ * cannot use user_64bit_mode() at that time.
+ */
+ current->mm->context.flags = MM_CONTEXT_UPROBE_IA32;
+ }
+
current->personality |= force_personality32;
/* Prepare the first "return" to user space */
task_pt_regs(current)->orig_ax = __NR_ia32_execve;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 84f581c91db4..740f3bdb3f61 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -119,11 +119,6 @@ EXPORT_SYMBOL(boot_cpu_data);
unsigned int def_to_bigsmp;
-/* For MCA, but anyone else can use it if they want */
-unsigned int machine_id;
-unsigned int machine_submodel_id;
-unsigned int BIOS_revision;
-
struct apm_info apm_info;
EXPORT_SYMBOL(apm_info);
@@ -1054,6 +1049,12 @@ void __init setup_arch(char **cmdline_p)
memblock_set_current_limit(ISA_END_ADDRESS);
e820__memblock_setup();
+ /*
+ * Needs to run after memblock setup because it needs the physical
+ * memory size.
+ */
+ sev_setup_arch();
+
reserve_bios_regions();
efi_fake_memmap();
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index be0d7d4152ec..ea794a083c44 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -804,11 +804,11 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs)
* want to handle. Thus you cannot kill init even with a SIGKILL even by
* mistake.
*/
-void arch_do_signal(struct pt_regs *regs)
+void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal)
{
struct ksignal ksig;
- if (get_signal(&ksig)) {
+ if (has_signal && get_signal(&ksig)) {
/* Whee! Actually deliver the signal. */
handle_signal(&ksig, regs);
return;
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index a7f3e12cfbdb..a5330ff498f0 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -31,7 +31,7 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(NSIGBUS != 5);
BUILD_BUG_ON(NSIGTRAP != 5);
BUILD_BUG_ON(NSIGCHLD != 6);
- BUILD_BUG_ON(NSIGSYS != 1);
+ BUILD_BUG_ON(NSIGSYS != 2);
/* This is part of the ABI and can never change in size: */
BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128);
@@ -165,16 +165,9 @@ void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
{
signal_compat_build_tests();
- /* Don't leak in-kernel non-uapi flags to user-space */
- if (oact)
- oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI);
-
if (!act)
return;
- /* Don't let flags to be set from userspace */
- act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI);
-
if (in_ia32_syscall())
act->sa.sa_flags |= SA_IA32_ABI;
if (in_x32_syscall())
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index de776b2e6046..8ca66af96a54 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -82,6 +82,10 @@
#include <asm/hw_irq.h>
#include <asm/stackprotector.h>
+#ifdef CONFIG_ACPI_CPPC_LIB
+#include <acpi/cppc_acpi.h>
+#endif
+
/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
@@ -148,7 +152,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
}
-static void init_freq_invariance(bool secondary);
+static void init_freq_invariance(bool secondary, bool cppc_ready);
/*
* Report back to the Boot Processor during boot time or to the caller processor
@@ -186,7 +190,7 @@ static void smp_callin(void)
*/
set_cpu_sibling_map(raw_smp_processor_id());
- init_freq_invariance(true);
+ init_freq_invariance(true, false);
/*
* Get our bogomips.
@@ -229,6 +233,7 @@ static void notrace start_secondary(void *unused)
#endif
cpu_init_exception_handling();
cpu_init();
+ rcu_cpu_starting(raw_smp_processor_id());
x86_cpuinit.early_percpu_clock_init();
preempt_disable();
smp_callin();
@@ -747,13 +752,14 @@ static void __init smp_quirk_init_udelay(void)
int
wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
{
+ u32 dm = apic->dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
unsigned long send_status, accept_status = 0;
int maxlvt;
/* Target chip */
/* Boot on the stack */
/* Kick the second */
- apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid);
+ apic_icr_write(APIC_DM_NMI | dm, apicid);
pr_debug("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
@@ -980,10 +986,7 @@ wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,
if (!boot_error) {
enable_start_cpu0 = 1;
*cpu0_nmi_registered = 1;
- if (apic->dest_logical == APIC_DEST_LOGICAL)
- id = cpu0_logical_apicid;
- else
- id = apicid;
+ id = apic->dest_mode_logical ? cpu0_logical_apicid : apicid;
boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip);
}
@@ -1340,7 +1343,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
set_sched_topology(x86_topology);
set_cpu_sibling_map(0);
- init_freq_invariance(false);
+ init_freq_invariance(false, false);
smp_sanity_check();
switch (apic_intr_mode) {
@@ -2027,6 +2030,48 @@ out:
return true;
}
+#ifdef CONFIG_ACPI_CPPC_LIB
+static bool amd_set_max_freq_ratio(void)
+{
+ struct cppc_perf_caps perf_caps;
+ u64 highest_perf, nominal_perf;
+ u64 perf_ratio;
+ int rc;
+
+ rc = cppc_get_perf_caps(0, &perf_caps);
+ if (rc) {
+ pr_debug("Could not retrieve perf counters (%d)\n", rc);
+ return false;
+ }
+
+ highest_perf = perf_caps.highest_perf;
+ nominal_perf = perf_caps.nominal_perf;
+
+ if (!highest_perf || !nominal_perf) {
+ pr_debug("Could not retrieve highest or nominal performance\n");
+ return false;
+ }
+
+ perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf);
+ /* midpoint between max_boost and max_P */
+ perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
+ if (!perf_ratio) {
+ pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n");
+ return false;
+ }
+
+ arch_turbo_freq_ratio = perf_ratio;
+ arch_set_max_freq_ratio(false);
+
+ return true;
+}
+#else
+static bool amd_set_max_freq_ratio(void)
+{
+ return false;
+}
+#endif
+
static void init_counter_refs(void)
{
u64 aperf, mperf;
@@ -2038,7 +2083,7 @@ static void init_counter_refs(void)
this_cpu_write(arch_prev_mperf, mperf);
}
-static void init_freq_invariance(bool secondary)
+static void init_freq_invariance(bool secondary, bool cppc_ready)
{
bool ret = false;
@@ -2054,15 +2099,38 @@ static void init_freq_invariance(bool secondary)
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
ret = intel_set_max_freq_ratio();
+ else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+ if (!cppc_ready) {
+ return;
+ }
+ ret = amd_set_max_freq_ratio();
+ }
if (ret) {
init_counter_refs();
static_branch_enable(&arch_scale_freq_key);
+ pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
} else {
pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
}
}
+#ifdef CONFIG_ACPI_CPPC_LIB
+static DEFINE_MUTEX(freq_invariance_lock);
+
+void init_freq_invariance_cppc(void)
+{
+ static bool secondary;
+
+ mutex_lock(&freq_invariance_lock);
+
+ init_freq_invariance(secondary, true);
+ secondary = true;
+
+ mutex_unlock(&freq_invariance_lock);
+}
+#endif
+
static void disable_freq_invariance_workfn(struct work_struct *work)
{
static_branch_disable(&arch_scale_freq_key);
@@ -2112,7 +2180,7 @@ error:
schedule_work(&disable_freq_invariance_work);
}
#else
-static inline void init_freq_invariance(bool secondary)
+static inline void init_freq_invariance(bool secondary, bool cppc_ready)
{
}
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index ae64f98ec2ab..4c09ba110204 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -93,6 +93,7 @@ static struct mm_struct tboot_mm = {
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
+ .write_protect_seq = SEQCNT_ZERO(tboot_mm.write_protect_seq),
MMAP_LOCK_INITIALIZER(init_mm)
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 0a2ec801b63f..f5477eab5692 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -25,6 +25,7 @@
*
* Send feedback to <[email protected]>
*/
+#include <linux/interrupt.h>
#include <linux/nodemask.h>
#include <linux/export.h>
#include <linux/mmzone.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index e19df6cde35d..7f5aec758f0e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -60,6 +60,7 @@
#include <asm/umip.h>
#include <asm/insn.h>
#include <asm/insn-eval.h>
+#include <asm/vdso.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -117,6 +118,9 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = trapnr;
die(str, regs, error_code);
+ } else {
+ if (fixup_vdso_exception(regs, trapnr, error_code, 0))
+ return 0;
}
/*
@@ -299,11 +303,12 @@ DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check)
local_irq_enable();
if (handle_user_split_lock(regs, error_code))
- return;
+ goto out;
do_trap(X86_TRAP_AC, SIGBUS, "alignment check", regs,
error_code, BUS_ADRALN, NULL);
+out:
local_irq_disable();
}
@@ -405,7 +410,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
}
#endif
- idtentry_enter_nmi(regs);
+ irqentry_nmi_enter(regs);
instrumentation_begin();
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
@@ -550,6 +555,9 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
+ if (fixup_vdso_exception(regs, X86_TRAP_GP, error_code, 0))
+ return;
+
show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
force_sig(SIGSEGV);
goto exit;
@@ -651,12 +659,13 @@ DEFINE_IDTENTRY_RAW(exc_int3)
instrumentation_end();
irqentry_exit_to_user_mode(regs);
} else {
- bool irq_state = idtentry_enter_nmi(regs);
+ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
+
instrumentation_begin();
if (!do_int3(regs))
die("int3", regs, 0);
instrumentation_end();
- idtentry_exit_nmi(regs, irq_state);
+ irqentry_nmi_exit(regs, irq_state);
}
}
@@ -851,7 +860,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
* includes the entry stack is excluded for everything.
*/
unsigned long dr7 = local_db_save();
- bool irq_state = idtentry_enter_nmi(regs);
+ irqentry_state_t irq_state = irqentry_nmi_enter(regs);
instrumentation_begin();
/*
@@ -908,7 +917,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
regs->flags &= ~X86_EFLAGS_TF;
out:
instrumentation_end();
- idtentry_exit_nmi(regs, irq_state);
+ irqentry_nmi_exit(regs, irq_state);
local_db_restore(dr7);
}
@@ -926,7 +935,7 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
/*
* NB: We can't easily clear DR7 here because
- * idtentry_exit_to_usermode() can invoke ptrace, schedule, access
+ * irqentry_exit_to_usermode() can invoke ptrace, schedule, access
* user memory, etc. This means that a recursive #DB is possible. If
* this happens, that #DB will hit exc_debug_kernel() and clear DR7.
* Since we're not on the IST stack right now, everything will be
@@ -1048,6 +1057,9 @@ static void math_error(struct pt_regs *regs, int trapnr)
if (!si_code)
goto exit;
+ if (fixup_vdso_exception(regs, trapnr, 0, 0))
+ return;
+
force_sig_fault(SIGFPE, si_code,
(void __user *)uprobe_get_trap_addr(regs));
exit:
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 138bdb1fd136..a2b413394917 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -1017,6 +1017,8 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
if (uprobe_post_sstep_notifier(regs))
ret = NOTIFY_STOP;
+ break;
+
default:
break;
}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index bf9e0adb5b7e..efd9e9ea17f2 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -454,13 +454,13 @@ SECTIONS
ASSERT(SIZEOF(.rela.dyn) == 0, "Unexpected run-time relocations (.rela) detected!")
}
-#ifdef CONFIG_X86_32
/*
* The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
*/
. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
"kernel image bigger than KERNEL_IMAGE_SIZE");
-#else
+
+#ifdef CONFIG_X86_64
/*
* Per-cpu symbols which need to be offset from __per_cpu_load
* for the boot processor.
@@ -470,18 +470,12 @@ INIT_PER_CPU(gdt_page);
INIT_PER_CPU(fixed_percpu_data);
INIT_PER_CPU(irq_stack_backing_store);
-/*
- * Build-time check on the image size:
- */
-. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
- "kernel image bigger than KERNEL_IMAGE_SIZE");
-
#ifdef CONFIG_SMP
. = ASSERT((fixed_percpu_data == 0),
"fixed_percpu_data is not at start of per-cpu area");
#endif
-#endif /* CONFIG_X86_32 */
+#endif /* CONFIG_X86_64 */
#ifdef CONFIG_KEXEC_CORE
#include <asm/kexec.h>
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index a3038d8deb6a..8b395821cb8d 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -110,6 +110,7 @@ struct x86_init_ops x86_init __initdata = {
.init_platform = x86_init_noop,
.guest_late_init = x86_init_noop,
.x2apic_available = bool_x86_init_noop,
+ .msi_ext_dest_id = bool_x86_init_noop,
.init_mem_mapping = x86_init_noop,
.init_after_bootmem = x86_init_noop,
},