- Specify what error addresses reported on AMD are actually usable

memory error addresses for further decoding
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEzv7L6UO9uDPlPSfHEsHwGGHeVUoFAmU7kb8ACgkQEsHwGGHe
 VUq3/A//VTrsON+RRS+M7PVewXMiTbwjVytum/9gWXtuUBEFdWCQjCe4TSaI6+mX
 v8inAomBE7s3SoQYkosF1VO2l0r68aJLOm6hczzbjz+ZjGvramDiv5qCs0iMM8m4
 Nvwyjeo1+2G6JeaX2rR7fqnZkA4NcYE1/s05pksNEaXMsAhpSOWenRgUK1EyQXLE
 y1u63G5GLMT4cpjEmEcbp9Lb02WwQzB9inZ1f4MFoujkI5VJ/9b68D+DpGwHd4Ag
 HNrg6LR/YpVwioVnsa+xEiQSxxwuRCHvS8kbc27d3qhfT4cRhmtAIsHYkyeO75TJ
 jkXU1Gme/k2RDEYHOz7heSVWgmG3y9/swc3UZJFE0QAnjdajY7mUsM9+o5uCm4Y6
 rALf8z7t0+oMpG1YML5Y+0wvgcPk9pih6Mm9tbBlFCXPi2OQ5bieNkHe7RQXHcQx
 xwFoQI0ByWvW7omu+jqA8iN4YSLaQhST2wzghPF1Wu7KAewu5lpU+9kmgmL7utme
 aHIQFdhRFusYEABqlr8XQSew5FMtIfOeWWCdgWHghUQp6LCsA0QeUxcQR9VdY9Th
 IgY1j4G2lQeLpDlnWE9VPMCWk4cuTABRyVWEu1B4wScU3xRWD8jOh+LcF76RdoYz
 k7GW0d68DGDCRgU7q86LNM/bNH0zyIIteTj64uHBzQm0ygJcdZU=
 =NcVX
 -----END PGP SIGNATURE-----

Merge tag 'ras_core_for_6.7_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 RAS updates from Borislav Petkov:

 - Specify what error addresses reported on AMD are actually usable
   memory error addresses for further decoding

* tag 'ras_core_for_6.7_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce: Cleanup mce_usable_address()
  x86/mce: Define amd_mce_usable_address()
  x86/MCE/AMD: Split amd_mce_is_memory_error()
This commit is contained in:
Linus Torvalds 2023-10-30 11:47:03 -10:00
commit 01ae815c50
5 changed files with 100 additions and 28 deletions

View file

@ -245,7 +245,7 @@ static inline void cmci_recheck(void) {}
int mce_available(struct cpuinfo_x86 *c);
bool mce_is_memory_error(struct mce *m);
bool mce_is_correctable(struct mce *m);
int mce_usable_address(struct mce *m);
bool mce_usable_address(struct mce *m);
DECLARE_PER_CPU(unsigned, mce_exception_count);
DECLARE_PER_CPU(unsigned, mce_poll_count);

View file

@ -713,17 +713,75 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
deferred_error_interrupt_enable(c);
}
bool amd_mce_is_memory_error(struct mce *m)
/*
* DRAM ECC errors are reported in the Northbridge (bank 4) with
* Extended Error Code 8.
*/
static bool legacy_mce_is_memory_error(struct mce *m)
{
return m->bank == 4 && XEC(m->status, 0x1f) == 8;
}
/*
* DRAM ECC errors are reported in Unified Memory Controllers with
* Extended Error Code 0.
*/
static bool smca_mce_is_memory_error(struct mce *m)
{
enum smca_bank_types bank_type;
/* ErrCodeExt[20:16] */
u8 xec = (m->status >> 16) & 0x1f;
if (XEC(m->status, 0x3f))
return false;
bank_type = smca_get_bank_type(m->extcpu, m->bank);
if (mce_flags.smca)
return (bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) && xec == 0x0;
return m->bank == 4 && xec == 0x8;
return bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2;
}
bool amd_mce_is_memory_error(struct mce *m)
{
if (mce_flags.smca)
return smca_mce_is_memory_error(m);
else
return legacy_mce_is_memory_error(m);
}
/*
* AMD systems do not have an explicit indicator that the value in MCA_ADDR is
* a system physical address. Therefore, individual cases need to be detected.
* Future cases and checks will be added as needed.
*
* 1) General case
* a) Assume address is not usable.
* 2) Poison errors
* a) Indicated by MCA_STATUS[43]: poison. Defined for all banks except legacy
* northbridge (bank 4).
* b) Refers to poison consumption in the core. Does not include "no action",
* "action optional", or "deferred" error severities.
* c) Will include a usable address so that immediate action can be taken.
* 3) Northbridge DRAM ECC errors
* a) Reported in legacy bank 4 with extended error code (XEC) 8.
* b) MCA_STATUS[43] is *not* defined as poison in legacy bank 4. Therefore,
* this bit should not be checked.
*
* NOTE: SMCA UMC memory errors fall into case #1.
*/
bool amd_mce_usable_address(struct mce *m)
{
/* Check special northbridge case 3) first. */
if (!mce_flags.smca) {
if (legacy_mce_is_memory_error(m))
return true;
else if (m->bank == 4)
return false;
}
/* Check poison bit for all other bank types. */
if (m->status & MCI_STATUS_POISON)
return true;
/* Assume address is not usable for all others. */
return false;
}
static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)

View file

@ -453,32 +453,22 @@ static void mce_irq_work_cb(struct irq_work *entry)
mce_schedule_work();
}
/*
* Check if the address reported by the CPU is in a format we can parse.
* It would be possible to add code for most other cases, but all would
* be somewhat complicated (e.g. segment offset would require an instruction
* parser). So only support physical addresses up to page granularity for now.
*/
int mce_usable_address(struct mce *m)
bool mce_usable_address(struct mce *m)
{
if (!(m->status & MCI_STATUS_ADDRV))
return 0;
return false;
/* Checks after this one are Intel/Zhaoxin-specific: */
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
return 1;
switch (m->cpuvendor) {
case X86_VENDOR_AMD:
return amd_mce_usable_address(m);
if (!(m->status & MCI_STATUS_MISCV))
return 0;
case X86_VENDOR_INTEL:
case X86_VENDOR_ZHAOXIN:
return intel_mce_usable_address(m);
if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
return 0;
if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
return 0;
return 1;
default:
return true;
}
}
EXPORT_SYMBOL_GPL(mce_usable_address);

View file

@ -536,3 +536,23 @@ bool intel_filter_mce(struct mce *m)
return false;
}
/*
* Check if the address reported by the CPU is in a format we can parse.
* It would be possible to add code for most other cases, but all would
* be somewhat complicated (e.g. segment offset would require an instruction
* parser). So only support physical addresses up to page granularity for now.
*/
bool intel_mce_usable_address(struct mce *m)
{
if (!(m->status & MCI_STATUS_MISCV))
return false;
if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
return false;
if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
return false;
return true;
}

View file

@ -49,6 +49,7 @@ void intel_init_cmci(void);
void intel_init_lmce(void);
void intel_clear_lmce(void);
bool intel_filter_mce(struct mce *m);
bool intel_mce_usable_address(struct mce *m);
#else
# define cmci_intel_adjust_timer mce_adjust_timer_default
static inline bool mce_intel_cmci_poll(void) { return false; }
@ -58,6 +59,7 @@ static inline void intel_init_cmci(void) { }
static inline void intel_init_lmce(void) { }
static inline void intel_clear_lmce(void) { }
static inline bool intel_filter_mce(struct mce *m) { return false; }
static inline bool intel_mce_usable_address(struct mce *m) { return false; }
#endif
void mce_timer_kick(unsigned long interval);
@ -210,6 +212,7 @@ extern bool filter_mce(struct mce *m);
#ifdef CONFIG_X86_MCE_AMD
extern bool amd_filter_mce(struct mce *m);
bool amd_mce_usable_address(struct mce *m);
/*
* If MCA_CONFIG[McaLsbInStatusSupported] is set, extract ErrAddr in bits
@ -237,6 +240,7 @@ static __always_inline void smca_extract_err_addr(struct mce *m)
#else
static inline bool amd_filter_mce(struct mce *m) { return false; }
static inline bool amd_mce_usable_address(struct mce *m) { return false; }
static inline void smca_extract_err_addr(struct mce *m) { }
#endif