diff options
Diffstat (limited to 'drivers/acpi/apei')
| -rw-r--r-- | drivers/acpi/apei/apei-base.c | 2 | ||||
| -rw-r--r-- | drivers/acpi/apei/einj.c | 56 | ||||
| -rw-r--r-- | drivers/acpi/apei/ghes.c | 130 |
3 files changed, 123 insertions, 65 deletions
diff --git a/drivers/acpi/apei/apei-base.c b/drivers/acpi/apei/apei-base.c index 9b52482b4ed5..c7c26872f4ce 100644 --- a/drivers/acpi/apei/apei-base.c +++ b/drivers/acpi/apei/apei-base.c @@ -25,9 +25,9 @@ #include <linux/slab.h> #include <linux/io.h> #include <linux/kref.h> -#include <linux/rculist.h> #include <linux/interrupt.h> #include <linux/debugfs.h> +#include <acpi/apei.h> #include <asm/unaligned.h> #include "apei-internal.h" diff --git a/drivers/acpi/apei/einj.c b/drivers/acpi/apei/einj.c index 6b583373c58a..ab86b2f4e719 100644 --- a/drivers/acpi/apei/einj.c +++ b/drivers/acpi/apei/einj.c @@ -358,6 +358,7 @@ static int __einj_error_trigger(u64 trigger_paddr, u32 type, */ if ((param_extension || acpi5) && (type & MEM_ERROR_MASK) && param2) { struct apei_resources addr_resources; + apei_resources_init(&addr_resources); trigger_param_region = einj_get_trigger_parameter_region( trigger_tab, param1, param2); @@ -432,11 +433,11 @@ static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, } v5param->flags = vendor_flags; } else if (flags) { - v5param->flags = flags; - v5param->memory_address = param1; - v5param->memory_address_range = param2; - v5param->apicid = param3; - v5param->pcie_sbdf = param4; + v5param->flags = flags; + v5param->memory_address = param1; + v5param->memory_address_range = param2; + v5param->apicid = param3; + v5param->pcie_sbdf = param4; } else { switch (type) { case ACPI_EINJ_PROCESSOR_CORRECTABLE: @@ -466,6 +467,7 @@ static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2, return rc; if (einj_param) { struct einj_parameter *v4param = einj_param; + v4param->param1 = param1; v4param->param2 = param2; } @@ -569,6 +571,20 @@ static u64 error_param2; static u64 error_param3; static u64 error_param4; static struct dentry *einj_debug_dir; +static const char * const einj_error_type_string[] = { + "0x00000001\tProcessor Correctable\n", + "0x00000002\tProcessor Uncorrectable non-fatal\n", + "0x00000004\tProcessor Uncorrectable fatal\n", + "0x00000008\tMemory Correctable\n", + "0x00000010\tMemory Uncorrectable non-fatal\n", + "0x00000020\tMemory Uncorrectable fatal\n", + "0x00000040\tPCI Express Correctable\n", + "0x00000080\tPCI Express Uncorrectable non-fatal\n", + "0x00000100\tPCI Express Uncorrectable fatal\n", + "0x00000200\tPlatform Correctable\n", + "0x00000400\tPlatform Uncorrectable non-fatal\n", + "0x00000800\tPlatform Uncorrectable fatal\n", +}; static int available_error_type_show(struct seq_file *m, void *v) { @@ -578,30 +594,9 @@ static int available_error_type_show(struct seq_file *m, void *v) rc = einj_get_available_error_type(&available_error_type); if (rc) return rc; - if (available_error_type & 0x0001) - seq_printf(m, "0x00000001\tProcessor Correctable\n"); - if (available_error_type & 0x0002) - seq_printf(m, "0x00000002\tProcessor Uncorrectable non-fatal\n"); - if (available_error_type & 0x0004) - seq_printf(m, "0x00000004\tProcessor Uncorrectable fatal\n"); - if (available_error_type & 0x0008) - seq_printf(m, "0x00000008\tMemory Correctable\n"); - if (available_error_type & 0x0010) - seq_printf(m, "0x00000010\tMemory Uncorrectable non-fatal\n"); - if (available_error_type & 0x0020) - seq_printf(m, "0x00000020\tMemory Uncorrectable fatal\n"); - if (available_error_type & 0x0040) - seq_printf(m, "0x00000040\tPCI Express Correctable\n"); - if (available_error_type & 0x0080) - seq_printf(m, "0x00000080\tPCI Express Uncorrectable non-fatal\n"); - if (available_error_type & 0x0100) - seq_printf(m, "0x00000100\tPCI Express Uncorrectable fatal\n"); - if (available_error_type & 0x0200) - seq_printf(m, "0x00000200\tPlatform Correctable\n"); - if (available_error_type & 0x0400) - seq_printf(m, "0x00000400\tPlatform Uncorrectable non-fatal\n"); - if (available_error_type & 0x0800) - seq_printf(m, "0x00000800\tPlatform Uncorrectable fatal\n"); + for (int pos = 0; pos < ARRAY_SIZE(einj_error_type_string); pos++) + if (available_error_type & BIT(pos)) + seq_puts(m, einj_error_type_string[pos]); return 0; } @@ -689,8 +684,7 @@ static int __init einj_init(void) if (status == AE_NOT_FOUND) { pr_warn("EINJ table not found.\n"); return -ENODEV; - } - else if (ACPI_FAILURE(status)) { + } else if (ACPI_FAILURE(status)) { pr_err("Failed to get EINJ table: %s\n", acpi_format_exception(status)); return -EINVAL; diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 80ad530583c9..066dc1f5c235 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -94,6 +94,8 @@ #define FIX_APEI_GHES_SDEI_CRITICAL __end_of_fixed_addresses #endif +static ATOMIC_NOTIFIER_HEAD(ghes_report_chain); + static inline bool is_hest_type_generic_v2(struct ghes *ghes) { return ghes->generic->header.type == ACPI_HEST_TYPE_GENERIC_ERROR_V2; @@ -108,6 +110,13 @@ bool ghes_disable; module_param_named(disable, ghes_disable, bool, 0); /* + * "ghes.edac_force_enable" forcibly enables ghes_edac and skips the platform + * check. + */ +static bool ghes_edac_force_enable; +module_param_named(edac_force_enable, ghes_edac_force_enable, bool, 0); + +/* * All error sources notified with HED (Hardware Error Device) share a * single notifier callback, so they need to be linked and checked one * by one. This holds true for NMI too. @@ -119,6 +128,13 @@ static LIST_HEAD(ghes_hed); static DEFINE_MUTEX(ghes_list_mutex); /* + * A list of GHES devices which are given to the corresponding EDAC driver + * ghes_edac for further use. + */ +static LIST_HEAD(ghes_devs); +static DEFINE_MUTEX(ghes_devs_mutex); + +/* * Because the memory area used to transfer hardware error information * from BIOS to Linux can be determined only in NMI, IRQ or timer * handler, but general ioremap can not be used in atomic context, so @@ -138,7 +154,7 @@ struct ghes_vendor_record_entry { static struct gen_pool *ghes_estatus_pool; static unsigned long ghes_estatus_pool_size_request; -static struct ghes_estatus_cache *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE]; +static struct ghes_estatus_cache __rcu *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE]; static atomic_t ghes_estatus_cache_alloced; static int ghes_panic_timeout __read_mostly = 30; @@ -163,7 +179,7 @@ static void ghes_unmap(void __iomem *vaddr, enum fixed_addresses fixmap_idx) clear_fixmap(fixmap_idx); } -int ghes_estatus_pool_init(int num_ghes) +int ghes_estatus_pool_init(unsigned int num_ghes) { unsigned long addr, len; int rc; @@ -645,7 +661,7 @@ static bool ghes_do_proc(struct ghes *ghes, if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) { struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); - ghes_edac_report_mem_error(sev, mem_err); + atomic_notifier_call_chain(&ghes_report_chain, sev, mem_err); arch_apei_report_mem_error(sev, mem_err); queued = ghes_handle_memory_failure(gdata, sev); @@ -773,48 +789,42 @@ static struct ghes_estatus_cache *ghes_estatus_cache_alloc( return cache; } -static void ghes_estatus_cache_free(struct ghes_estatus_cache *cache) +static void ghes_estatus_cache_rcu_free(struct rcu_head *head) { + struct ghes_estatus_cache *cache; u32 len; + cache = container_of(head, struct ghes_estatus_cache, rcu); len = cper_estatus_len(GHES_ESTATUS_FROM_CACHE(cache)); len = GHES_ESTATUS_CACHE_LEN(len); gen_pool_free(ghes_estatus_pool, (unsigned long)cache, len); atomic_dec(&ghes_estatus_cache_alloced); } -static void ghes_estatus_cache_rcu_free(struct rcu_head *head) +static void +ghes_estatus_cache_add(struct acpi_hest_generic *generic, + struct acpi_hest_generic_status *estatus) { - struct ghes_estatus_cache *cache; - - cache = container_of(head, struct ghes_estatus_cache, rcu); - ghes_estatus_cache_free(cache); -} - -static void ghes_estatus_cache_add( - struct acpi_hest_generic *generic, - struct acpi_hest_generic_status *estatus) -{ - int i, slot = -1, count; unsigned long long now, duration, period, max_period = 0; - struct ghes_estatus_cache *cache, *slot_cache = NULL, *new_cache; + struct ghes_estatus_cache *cache, *new_cache; + struct ghes_estatus_cache __rcu *victim; + int i, slot = -1, count; new_cache = ghes_estatus_cache_alloc(generic, estatus); - if (new_cache == NULL) + if (!new_cache) return; + rcu_read_lock(); now = sched_clock(); for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) { cache = rcu_dereference(ghes_estatus_caches[i]); if (cache == NULL) { slot = i; - slot_cache = NULL; break; } duration = now - cache->time_in; if (duration >= GHES_ESTATUS_IN_CACHE_MAX_NSEC) { slot = i; - slot_cache = cache; break; } count = atomic_read(&cache->count); @@ -823,18 +833,30 @@ static void ghes_estatus_cache_add( if (period > max_period) { max_period = period; slot = i; - slot_cache = cache; } } - /* new_cache must be put into array after its contents are written */ - smp_wmb(); - if (slot != -1 && cmpxchg(ghes_estatus_caches + slot, - slot_cache, new_cache) == slot_cache) { - if (slot_cache) - call_rcu(&slot_cache->rcu, ghes_estatus_cache_rcu_free); - } else - ghes_estatus_cache_free(new_cache); rcu_read_unlock(); + + if (slot != -1) { + /* + * Use release semantics to ensure that ghes_estatus_cached() + * running on another CPU will see the updated cache fields if + * it can see the new value of the pointer. + */ + victim = xchg_release(&ghes_estatus_caches[slot], + RCU_INITIALIZER(new_cache)); + + /* + * At this point, victim may point to a cached item different + * from the one based on which we selected the slot. Instead of + * going to the loop again to pick another slot, let's just + * drop the other item anyway: this may cause a false cache + * miss later on, but that won't cause any problems. + */ + if (victim) + call_rcu(&unrcu_pointer(victim)->rcu, + ghes_estatus_cache_rcu_free); + } } static void __ghes_panic(struct ghes *ghes, @@ -1376,7 +1398,11 @@ static int ghes_probe(struct platform_device *ghes_dev) platform_set_drvdata(ghes_dev, ghes); - ghes_edac_register(ghes, &ghes_dev->dev); + ghes->dev = &ghes_dev->dev; + + mutex_lock(&ghes_devs_mutex); + list_add_tail(&ghes->elist, &ghes_devs); + mutex_unlock(&ghes_devs_mutex); /* Handle any pending errors right away */ spin_lock_irqsave(&ghes_notify_lock_irq, flags); @@ -1440,12 +1466,12 @@ static int ghes_remove(struct platform_device *ghes_dev) ghes_fini(ghes); - ghes_edac_unregister(ghes); + mutex_lock(&ghes_devs_mutex); + list_del(&ghes->elist); + mutex_unlock(&ghes_devs_mutex); kfree(ghes); - platform_set_drvdata(ghes_dev, NULL); - return 0; } @@ -1497,3 +1523,41 @@ void __init acpi_ghes_init(void) else pr_info(GHES_PFX "Failed to enable APEI firmware first mode.\n"); } + +/* + * Known x86 systems that prefer GHES error reporting: + */ +static struct acpi_platform_list plat_list[] = { + {"HPE ", "Server ", 0, ACPI_SIG_FADT, all_versions}, + { } /* End */ +}; + +struct list_head *ghes_get_devices(void) +{ + int idx = -1; + + if (IS_ENABLED(CONFIG_X86)) { + idx = acpi_match_platform_list(plat_list); + if (idx < 0) { + if (!ghes_edac_force_enable) + return NULL; + + pr_warn_once("Force-loading ghes_edac on an unsupported platform. You're on your own!\n"); + } + } + + return &ghes_devs; +} +EXPORT_SYMBOL_GPL(ghes_get_devices); + +void ghes_register_report_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_register(&ghes_report_chain, nb); +} +EXPORT_SYMBOL_GPL(ghes_register_report_chain); + +void ghes_unregister_report_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_unregister(&ghes_report_chain, nb); +} +EXPORT_SYMBOL_GPL(ghes_unregister_report_chain); |